clang  7.0.0svn
avxintrin.h
Go to the documentation of this file.
1 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to deal
5  * in the Software without restriction, including without limitation the rights
6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7  * copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19  * THE SOFTWARE.
20  *
21  *===-----------------------------------------------------------------------===
22  */
23 
24 #ifndef __IMMINTRIN_H
25 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
26 #endif
27 
28 #ifndef __AVXINTRIN_H
29 #define __AVXINTRIN_H
30 
/* Internal 256-bit (32-byte) vector types, one per floating-point or signed
 * integer element type. */
typedef double    __v4df  __attribute__((__vector_size__(32)));
typedef float     __v8sf  __attribute__((__vector_size__(32)));
typedef long long __v4di  __attribute__((__vector_size__(32)));
typedef int       __v8si  __attribute__((__vector_size__(32)));
typedef short     __v16hi __attribute__((__vector_size__(32)));
typedef char      __v32qi __attribute__((__vector_size__(32)));

/* Unsigned counterparts of the integer vector types. */
typedef unsigned long long __v4du  __attribute__((__vector_size__(32)));
typedef unsigned int       __v8su  __attribute__((__vector_size__(32)));
typedef unsigned short     __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char      __v32qu __attribute__((__vector_size__(32)));

/* Byte vector whose elements are explicitly signed. It is for internal use
 * and should not appear in the public interface. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));

/* Public 256-bit vector types. */
typedef float     __m256  __attribute__((__vector_size__(32)));
typedef double    __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Attribute set applied to every function defined in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx")))
54 
55 /* Arithmetic */
56 /// Adds two 256-bit vectors of [4 x double].
57 ///
58 /// \headerfile <x86intrin.h>
59 ///
60 /// This intrinsic corresponds to the <c> VADDPD </c> instruction.
61 ///
62 /// \param __a
63 /// A 256-bit vector of [4 x double] containing one of the source operands.
64 /// \param __b
65 /// A 256-bit vector of [4 x double] containing one of the source operands.
66 /// \returns A 256-bit vector of [4 x double] containing the sums of both
67 /// operands.
68 static __inline __m256d __DEFAULT_FN_ATTRS
69 _mm256_add_pd(__m256d __a, __m256d __b)
70 {
71  return (__m256d)((__v4df)__a+(__v4df)__b);
72 }
73 
74 /// Adds two 256-bit vectors of [8 x float].
75 ///
76 /// \headerfile <x86intrin.h>
77 ///
78 /// This intrinsic corresponds to the <c> VADDPS </c> instruction.
79 ///
80 /// \param __a
81 /// A 256-bit vector of [8 x float] containing one of the source operands.
82 /// \param __b
83 /// A 256-bit vector of [8 x float] containing one of the source operands.
84 /// \returns A 256-bit vector of [8 x float] containing the sums of both
85 /// operands.
86 static __inline __m256 __DEFAULT_FN_ATTRS
87 _mm256_add_ps(__m256 __a, __m256 __b)
88 {
89  return (__m256)((__v8sf)__a+(__v8sf)__b);
90 }
91 
92 /// Subtracts two 256-bit vectors of [4 x double].
93 ///
94 /// \headerfile <x86intrin.h>
95 ///
96 /// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
97 ///
98 /// \param __a
99 /// A 256-bit vector of [4 x double] containing the minuend.
100 /// \param __b
101 /// A 256-bit vector of [4 x double] containing the subtrahend.
102 /// \returns A 256-bit vector of [4 x double] containing the differences between
103 /// both operands.
104 static __inline __m256d __DEFAULT_FN_ATTRS
105 _mm256_sub_pd(__m256d __a, __m256d __b)
106 {
107  return (__m256d)((__v4df)__a-(__v4df)__b);
108 }
109 
110 /// Subtracts two 256-bit vectors of [8 x float].
111 ///
112 /// \headerfile <x86intrin.h>
113 ///
114 /// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
115 ///
116 /// \param __a
117 /// A 256-bit vector of [8 x float] containing the minuend.
118 /// \param __b
119 /// A 256-bit vector of [8 x float] containing the subtrahend.
120 /// \returns A 256-bit vector of [8 x float] containing the differences between
121 /// both operands.
122 static __inline __m256 __DEFAULT_FN_ATTRS
123 _mm256_sub_ps(__m256 __a, __m256 __b)
124 {
125  return (__m256)((__v8sf)__a-(__v8sf)__b);
126 }
127 
128 /// Adds the even-indexed values and subtracts the odd-indexed values of
129 /// two 256-bit vectors of [4 x double].
130 ///
131 /// \headerfile <x86intrin.h>
132 ///
133 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
134 ///
135 /// \param __a
136 /// A 256-bit vector of [4 x double] containing the left source operand.
137 /// \param __b
138 /// A 256-bit vector of [4 x double] containing the right source operand.
139 /// \returns A 256-bit vector of [4 x double] containing the alternating sums
140 /// and differences between both operands.
141 static __inline __m256d __DEFAULT_FN_ATTRS
142 _mm256_addsub_pd(__m256d __a, __m256d __b)
143 {
144  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
145 }
146 
147 /// Adds the even-indexed values and subtracts the odd-indexed values of
148 /// two 256-bit vectors of [8 x float].
149 ///
150 /// \headerfile <x86intrin.h>
151 ///
152 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
153 ///
154 /// \param __a
155 /// A 256-bit vector of [8 x float] containing the left source operand.
156 /// \param __b
157 /// A 256-bit vector of [8 x float] containing the right source operand.
158 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and
159 /// differences between both operands.
160 static __inline __m256 __DEFAULT_FN_ATTRS
161 _mm256_addsub_ps(__m256 __a, __m256 __b)
162 {
163  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
164 }
165 
166 /// Divides two 256-bit vectors of [4 x double].
167 ///
168 /// \headerfile <x86intrin.h>
169 ///
170 /// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
171 ///
172 /// \param __a
173 /// A 256-bit vector of [4 x double] containing the dividend.
174 /// \param __b
175 /// A 256-bit vector of [4 x double] containing the divisor.
176 /// \returns A 256-bit vector of [4 x double] containing the quotients of both
177 /// operands.
178 static __inline __m256d __DEFAULT_FN_ATTRS
179 _mm256_div_pd(__m256d __a, __m256d __b)
180 {
181  return (__m256d)((__v4df)__a/(__v4df)__b);
182 }
183 
184 /// Divides two 256-bit vectors of [8 x float].
185 ///
186 /// \headerfile <x86intrin.h>
187 ///
188 /// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
189 ///
190 /// \param __a
191 /// A 256-bit vector of [8 x float] containing the dividend.
192 /// \param __b
193 /// A 256-bit vector of [8 x float] containing the divisor.
194 /// \returns A 256-bit vector of [8 x float] containing the quotients of both
195 /// operands.
196 static __inline __m256 __DEFAULT_FN_ATTRS
197 _mm256_div_ps(__m256 __a, __m256 __b)
198 {
199  return (__m256)((__v8sf)__a/(__v8sf)__b);
200 }
201 
202 /// Compares two 256-bit vectors of [4 x double] and returns the greater
203 /// of each pair of values.
204 ///
205 /// \headerfile <x86intrin.h>
206 ///
207 /// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
208 ///
209 /// \param __a
210 /// A 256-bit vector of [4 x double] containing one of the operands.
211 /// \param __b
212 /// A 256-bit vector of [4 x double] containing one of the operands.
213 /// \returns A 256-bit vector of [4 x double] containing the maximum values
214 /// between both operands.
215 static __inline __m256d __DEFAULT_FN_ATTRS
216 _mm256_max_pd(__m256d __a, __m256d __b)
217 {
218  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
219 }
220 
221 /// Compares two 256-bit vectors of [8 x float] and returns the greater
222 /// of each pair of values.
223 ///
224 /// \headerfile <x86intrin.h>
225 ///
226 /// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
227 ///
228 /// \param __a
229 /// A 256-bit vector of [8 x float] containing one of the operands.
230 /// \param __b
231 /// A 256-bit vector of [8 x float] containing one of the operands.
232 /// \returns A 256-bit vector of [8 x float] containing the maximum values
233 /// between both operands.
234 static __inline __m256 __DEFAULT_FN_ATTRS
235 _mm256_max_ps(__m256 __a, __m256 __b)
236 {
237  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
238 }
239 
240 /// Compares two 256-bit vectors of [4 x double] and returns the lesser
241 /// of each pair of values.
242 ///
243 /// \headerfile <x86intrin.h>
244 ///
245 /// This intrinsic corresponds to the <c> VMINPD </c> instruction.
246 ///
247 /// \param __a
248 /// A 256-bit vector of [4 x double] containing one of the operands.
249 /// \param __b
250 /// A 256-bit vector of [4 x double] containing one of the operands.
251 /// \returns A 256-bit vector of [4 x double] containing the minimum values
252 /// between both operands.
253 static __inline __m256d __DEFAULT_FN_ATTRS
254 _mm256_min_pd(__m256d __a, __m256d __b)
255 {
256  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
257 }
258 
259 /// Compares two 256-bit vectors of [8 x float] and returns the lesser
260 /// of each pair of values.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// This intrinsic corresponds to the <c> VMINPS </c> instruction.
265 ///
266 /// \param __a
267 /// A 256-bit vector of [8 x float] containing one of the operands.
268 /// \param __b
269 /// A 256-bit vector of [8 x float] containing one of the operands.
270 /// \returns A 256-bit vector of [8 x float] containing the minimum values
271 /// between both operands.
272 static __inline __m256 __DEFAULT_FN_ATTRS
273 _mm256_min_ps(__m256 __a, __m256 __b)
274 {
275  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
276 }
277 
278 /// Multiplies two 256-bit vectors of [4 x double].
279 ///
280 /// \headerfile <x86intrin.h>
281 ///
282 /// This intrinsic corresponds to the <c> VMULPD </c> instruction.
283 ///
284 /// \param __a
285 /// A 256-bit vector of [4 x double] containing one of the operands.
286 /// \param __b
287 /// A 256-bit vector of [4 x double] containing one of the operands.
288 /// \returns A 256-bit vector of [4 x double] containing the products of both
289 /// operands.
290 static __inline __m256d __DEFAULT_FN_ATTRS
291 _mm256_mul_pd(__m256d __a, __m256d __b)
292 {
293  return (__m256d)((__v4df)__a * (__v4df)__b);
294 }
295 
296 /// Multiplies two 256-bit vectors of [8 x float].
297 ///
298 /// \headerfile <x86intrin.h>
299 ///
300 /// This intrinsic corresponds to the <c> VMULPS </c> instruction.
301 ///
302 /// \param __a
303 /// A 256-bit vector of [8 x float] containing one of the operands.
304 /// \param __b
305 /// A 256-bit vector of [8 x float] containing one of the operands.
306 /// \returns A 256-bit vector of [8 x float] containing the products of both
307 /// operands.
308 static __inline __m256 __DEFAULT_FN_ATTRS
309 _mm256_mul_ps(__m256 __a, __m256 __b)
310 {
311  return (__m256)((__v8sf)__a * (__v8sf)__b);
312 }
313 
314 /// Calculates the square roots of the values in a 256-bit vector of
315 /// [4 x double].
316 ///
317 /// \headerfile <x86intrin.h>
318 ///
319 /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
320 ///
321 /// \param __a
322 /// A 256-bit vector of [4 x double].
323 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
324 /// values in the operand.
325 static __inline __m256d __DEFAULT_FN_ATTRS
326 _mm256_sqrt_pd(__m256d __a)
327 {
328  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
329 }
330 
331 /// Calculates the square roots of the values in a 256-bit vector of
332 /// [8 x float].
333 ///
334 /// \headerfile <x86intrin.h>
335 ///
336 /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
337 ///
338 /// \param __a
339 /// A 256-bit vector of [8 x float].
340 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
341 /// values in the operand.
342 static __inline __m256 __DEFAULT_FN_ATTRS
343 _mm256_sqrt_ps(__m256 __a)
344 {
345  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
346 }
347 
348 /// Calculates the reciprocal square roots of the values in a 256-bit
349 /// vector of [8 x float].
350 ///
351 /// \headerfile <x86intrin.h>
352 ///
353 /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
354 ///
355 /// \param __a
356 /// A 256-bit vector of [8 x float].
357 /// \returns A 256-bit vector of [8 x float] containing the reciprocal square
358 /// roots of the values in the operand.
359 static __inline __m256 __DEFAULT_FN_ATTRS
360 _mm256_rsqrt_ps(__m256 __a)
361 {
362  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
363 }
364 
365 /// Calculates the reciprocals of the values in a 256-bit vector of
366 /// [8 x float].
367 ///
368 /// \headerfile <x86intrin.h>
369 ///
370 /// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
371 ///
372 /// \param __a
373 /// A 256-bit vector of [8 x float].
374 /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
375 /// values in the operand.
376 static __inline __m256 __DEFAULT_FN_ATTRS
377 _mm256_rcp_ps(__m256 __a)
378 {
379  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
380 }
381 
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* The whole expansion is parenthesized so that a postfix operator applied to
 * the macro result (such as a subscript) binds to the cast result rather than
 * to the builtin call inside the cast. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
413 
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* The whole expansion is parenthesized so that a postfix operator applied to
 * the macro result (such as a subscript) binds to the cast result rather than
 * to the builtin call inside the cast. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
445 
/// Rounds each value of a 256-bit vector of [4 x double] up to an integer
///    value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// Expands to \c _mm256_round_pd with the \c _MM_FROUND_CEIL rounding
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double]; the source values.
/// \returns A 256-bit vector of [4 x double] holding the rounded-up values.
#define _mm256_ceil_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
462 
/// Rounds each value of a 256-bit vector of [4 x double] down to an integer
///    value. The results are returned as 64-bit double-precision
///    floating-point values.
///
/// Expands to \c _mm256_round_pd with the \c _MM_FROUND_FLOOR rounding
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double]; the source values.
/// \returns A 256-bit vector of [4 x double] holding the rounded-down
///    values.
#define _mm256_floor_pd(V) \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
480 
/// Rounds each value of a 256-bit vector of [8 x float] up to an integer
///    value. The results are returned as floating-point values.
///
/// Expands to \c _mm256_round_ps with the \c _MM_FROUND_CEIL rounding
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float]; the source values.
/// \returns A 256-bit vector of [8 x float] holding the rounded-up values.
#define _mm256_ceil_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
497 
/// Rounds each value of a 256-bit vector of [8 x float] down to an integer
///    value. The results are returned as floating-point values.
///
/// Expands to \c _mm256_round_ps with the \c _MM_FROUND_FLOOR rounding
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float]; the source values.
/// \returns A 256-bit vector of [8 x float] holding the rounded-down values.
#define _mm256_floor_ps(V) \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
514 
515 /* Logical */
516 /// Performs a bitwise AND of two 256-bit vectors of [4 x double].
517 ///
518 /// \headerfile <x86intrin.h>
519 ///
520 /// This intrinsic corresponds to the <c> VANDPD </c> instruction.
521 ///
522 /// \param __a
523 /// A 256-bit vector of [4 x double] containing one of the source operands.
524 /// \param __b
525 /// A 256-bit vector of [4 x double] containing one of the source operands.
526 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
527 /// values between both operands.
528 static __inline __m256d __DEFAULT_FN_ATTRS
529 _mm256_and_pd(__m256d __a, __m256d __b)
530 {
531  return (__m256d)((__v4du)__a & (__v4du)__b);
532 }
533 
534 /// Performs a bitwise AND of two 256-bit vectors of [8 x float].
535 ///
536 /// \headerfile <x86intrin.h>
537 ///
538 /// This intrinsic corresponds to the <c> VANDPS </c> instruction.
539 ///
540 /// \param __a
541 /// A 256-bit vector of [8 x float] containing one of the source operands.
542 /// \param __b
543 /// A 256-bit vector of [8 x float] containing one of the source operands.
544 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
545 /// values between both operands.
546 static __inline __m256 __DEFAULT_FN_ATTRS
547 _mm256_and_ps(__m256 __a, __m256 __b)
548 {
549  return (__m256)((__v8su)__a & (__v8su)__b);
550 }
551 
552 /// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
553 /// the one's complement of the values contained in the first source operand.
554 ///
555 /// \headerfile <x86intrin.h>
556 ///
557 /// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
558 ///
559 /// \param __a
560 /// A 256-bit vector of [4 x double] containing the left source operand. The
561 /// one's complement of this value is used in the bitwise AND.
562 /// \param __b
563 /// A 256-bit vector of [4 x double] containing the right source operand.
564 /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
565 /// values of the second operand and the one's complement of the first
566 /// operand.
567 static __inline __m256d __DEFAULT_FN_ATTRS
568 _mm256_andnot_pd(__m256d __a, __m256d __b)
569 {
570  return (__m256d)(~(__v4du)__a & (__v4du)__b);
571 }
572 
573 /// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
574 /// the one's complement of the values contained in the first source operand.
575 ///
576 /// \headerfile <x86intrin.h>
577 ///
578 /// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
579 ///
580 /// \param __a
581 /// A 256-bit vector of [8 x float] containing the left source operand. The
582 /// one's complement of this value is used in the bitwise AND.
583 /// \param __b
584 /// A 256-bit vector of [8 x float] containing the right source operand.
585 /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
586 /// values of the second operand and the one's complement of the first
587 /// operand.
588 static __inline __m256 __DEFAULT_FN_ATTRS
589 _mm256_andnot_ps(__m256 __a, __m256 __b)
590 {
591  return (__m256)(~(__v8su)__a & (__v8su)__b);
592 }
593 
594 /// Performs a bitwise OR of two 256-bit vectors of [4 x double].
595 ///
596 /// \headerfile <x86intrin.h>
597 ///
598 /// This intrinsic corresponds to the <c> VORPD </c> instruction.
599 ///
600 /// \param __a
601 /// A 256-bit vector of [4 x double] containing one of the source operands.
602 /// \param __b
603 /// A 256-bit vector of [4 x double] containing one of the source operands.
604 /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
605 /// values between both operands.
606 static __inline __m256d __DEFAULT_FN_ATTRS
607 _mm256_or_pd(__m256d __a, __m256d __b)
608 {
609  return (__m256d)((__v4du)__a | (__v4du)__b);
610 }
611 
612 /// Performs a bitwise OR of two 256-bit vectors of [8 x float].
613 ///
614 /// \headerfile <x86intrin.h>
615 ///
616 /// This intrinsic corresponds to the <c> VORPS </c> instruction.
617 ///
618 /// \param __a
619 /// A 256-bit vector of [8 x float] containing one of the source operands.
620 /// \param __b
621 /// A 256-bit vector of [8 x float] containing one of the source operands.
622 /// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
623 /// values between both operands.
624 static __inline __m256 __DEFAULT_FN_ATTRS
625 _mm256_or_ps(__m256 __a, __m256 __b)
626 {
627  return (__m256)((__v8su)__a | (__v8su)__b);
628 }
629 
630 /// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
631 ///
632 /// \headerfile <x86intrin.h>
633 ///
634 /// This intrinsic corresponds to the <c> VXORPD </c> instruction.
635 ///
636 /// \param __a
637 /// A 256-bit vector of [4 x double] containing one of the source operands.
638 /// \param __b
639 /// A 256-bit vector of [4 x double] containing one of the source operands.
640 /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
641 /// values between both operands.
642 static __inline __m256d __DEFAULT_FN_ATTRS
643 _mm256_xor_pd(__m256d __a, __m256d __b)
644 {
645  return (__m256d)((__v4du)__a ^ (__v4du)__b);
646 }
647 
648 /// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
649 ///
650 /// \headerfile <x86intrin.h>
651 ///
652 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
653 ///
654 /// \param __a
655 /// A 256-bit vector of [8 x float] containing one of the source operands.
656 /// \param __b
657 /// A 256-bit vector of [8 x float] containing one of the source operands.
658 /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
659 /// values between both operands.
660 static __inline __m256 __DEFAULT_FN_ATTRS
661 _mm256_xor_ps(__m256 __a, __m256 __b)
662 {
663  return (__m256)((__v8su)__a ^ (__v8su)__b);
664 }
665 
666 /* Horizontal arithmetic */
667 /// Horizontally adds the adjacent pairs of values contained in two
668 /// 256-bit vectors of [4 x double].
669 ///
670 /// \headerfile <x86intrin.h>
671 ///
672 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
673 ///
674 /// \param __a
675 /// A 256-bit vector of [4 x double] containing one of the source operands.
676 /// The horizontal sums of the values are returned in the even-indexed
677 /// elements of a vector of [4 x double].
678 /// \param __b
679 /// A 256-bit vector of [4 x double] containing one of the source operands.
680 /// The horizontal sums of the values are returned in the odd-indexed
681 /// elements of a vector of [4 x double].
682 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
683 /// both operands.
684 static __inline __m256d __DEFAULT_FN_ATTRS
685 _mm256_hadd_pd(__m256d __a, __m256d __b)
686 {
687  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
688 }
689 
690 /// Horizontally adds the adjacent pairs of values contained in two
691 /// 256-bit vectors of [8 x float].
692 ///
693 /// \headerfile <x86intrin.h>
694 ///
695 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
696 ///
697 /// \param __a
698 /// A 256-bit vector of [8 x float] containing one of the source operands.
699 /// The horizontal sums of the values are returned in the elements with
700 /// index 0, 1, 4, 5 of a vector of [8 x float].
701 /// \param __b
702 /// A 256-bit vector of [8 x float] containing one of the source operands.
703 /// The horizontal sums of the values are returned in the elements with
704 /// index 2, 3, 6, 7 of a vector of [8 x float].
705 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
706 /// both operands.
707 static __inline __m256 __DEFAULT_FN_ATTRS
708 _mm256_hadd_ps(__m256 __a, __m256 __b)
709 {
710  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
711 }
712 
713 /// Horizontally subtracts the adjacent pairs of values contained in two
714 /// 256-bit vectors of [4 x double].
715 ///
716 /// \headerfile <x86intrin.h>
717 ///
718 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
719 ///
720 /// \param __a
721 /// A 256-bit vector of [4 x double] containing one of the source operands.
722 /// The horizontal differences between the values are returned in the
723 /// even-indexed elements of a vector of [4 x double].
724 /// \param __b
725 /// A 256-bit vector of [4 x double] containing one of the source operands.
726 /// The horizontal differences between the values are returned in the
727 /// odd-indexed elements of a vector of [4 x double].
728 /// \returns A 256-bit vector of [4 x double] containing the horizontal
729 /// differences of both operands.
730 static __inline __m256d __DEFAULT_FN_ATTRS
731 _mm256_hsub_pd(__m256d __a, __m256d __b)
732 {
733  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
734 }
735 
736 /// Horizontally subtracts the adjacent pairs of values contained in two
737 /// 256-bit vectors of [8 x float].
738 ///
739 /// \headerfile <x86intrin.h>
740 ///
741 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
742 ///
743 /// \param __a
744 /// A 256-bit vector of [8 x float] containing one of the source operands.
745 /// The horizontal differences between the values are returned in the
746 /// elements with index 0, 1, 4, 5 of a vector of [8 x float].
747 /// \param __b
748 /// A 256-bit vector of [8 x float] containing one of the source operands.
749 /// The horizontal differences between the values are returned in the
750 /// elements with index 2, 3, 6, 7 of a vector of [8 x float].
751 /// \returns A 256-bit vector of [8 x float] containing the horizontal
752 /// differences of both operands.
753 static __inline __m256 __DEFAULT_FN_ATTRS
754 _mm256_hsub_ps(__m256 __a, __m256 __b)
755 {
756  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
757 }
758 
759 /* Vector permutations */
760 /// Copies the values in a 128-bit vector of [2 x double] as specified
761 /// by the 128-bit integer vector operand.
762 ///
763 /// \headerfile <x86intrin.h>
764 ///
765 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
766 ///
767 /// \param __a
768 /// A 128-bit vector of [2 x double].
769 /// \param __c
770 /// A 128-bit integer vector operand specifying how the values are to be
771 /// copied. \n
772 /// Bit [1]: \n
773 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
774 /// vector. \n
775 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the
776 /// returned vector. \n
777 /// Bit [65]: \n
778 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the
779 /// returned vector. \n
780 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the
781 /// returned vector.
782 /// \returns A 128-bit vector of [2 x double] containing the copied values.
783 static __inline __m128d __DEFAULT_FN_ATTRS
784 _mm_permutevar_pd(__m128d __a, __m128i __c)
785 {
786  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
787 }
788 
789 /// Copies the values in a 256-bit vector of [4 x double] as specified
790 /// by the 256-bit integer vector operand.
791 ///
792 /// \headerfile <x86intrin.h>
793 ///
794 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
795 ///
796 /// \param __a
797 /// A 256-bit vector of [4 x double].
798 /// \param __c
799 /// A 256-bit integer vector operand specifying how the values are to be
800 /// copied. \n
801 /// Bit [1]: \n
802 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
803 /// vector. \n
804 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the
805 /// returned vector. \n
806 /// Bit [65]: \n
807 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the
808 /// returned vector. \n
809 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the
810 /// returned vector. \n
811 /// Bit [129]: \n
812 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the
813 /// returned vector. \n
814 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the
815 /// returned vector. \n
816 /// Bit [193]: \n
817 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the
818 /// returned vector. \n
819 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the
820 /// returned vector.
821 /// \returns A 256-bit vector of [4 x double] containing the copied values.
822 static __inline __m256d __DEFAULT_FN_ATTRS
823 _mm256_permutevar_pd(__m256d __a, __m256i __c)
824 {
825  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
826 }
827 
828 /// Permutes a 128-bit vector of [4 x float] using a 128-bit integer control.
829 ///
830 /// \headerfile <x86intrin.h>
831 ///
832 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
833 ///
834 /// \param __a
835 /// A 128-bit vector of [4 x float].
836 /// \param __c
837 /// A 128-bit integer vector operand specifying how the values are to be
838 /// copied. \n
839 /// Bits [1:0]: \n
840 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the
841 /// returned vector. \n
842 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the
843 /// returned vector. \n
844 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the
845 /// returned vector. \n
846 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the
847 /// returned vector. \n
848 /// Bits [33:32]: \n
849 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the
850 /// returned vector. \n
851 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the
852 /// returned vector. \n
853 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the
854 /// returned vector. \n
855 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the
856 /// returned vector. \n
857 /// Bits [65:64]: \n
858 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the
859 /// returned vector. \n
860 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the
861 /// returned vector. \n
862 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the
863 /// returned vector. \n
864 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the
865 /// returned vector. \n
866 /// Bits [97:96]: \n
867 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the
868 /// returned vector. \n
869 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the
870 /// returned vector. \n
871 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the
872 /// returned vector. \n
873 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the
874 /// returned vector.
875 /// \returns A 128-bit vector of [4 x float] containing the copied values.
876 static __inline __m128 __DEFAULT_FN_ATTRS
877 _mm_permutevar_ps(__m128 __a, __m128i __c)
878 {
879  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
880 }
881 
882 /// Permutes the values within each 128-bit half of a 256-bit vector of
883 /// [8 x float], as specified by the 256-bit integer vector operand.
884 ///
885 /// \headerfile <x86intrin.h>
886 ///
887 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
888 ///
889 /// \param __a
890 /// A 256-bit vector of [8 x float].
891 /// \param __c
892 /// A 256-bit integer vector operand specifying how the values are to be
893 /// copied; bits [1:0] of each 32-bit element are the selector. \n
894 /// Bits [1:0]: \n
895 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the
896 /// returned vector. \n
897 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the
898 /// returned vector. \n
899 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the
900 /// returned vector. \n
901 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the
902 /// returned vector. \n
903 /// Bits [33:32]: \n
904 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the
905 /// returned vector. \n
906 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the
907 /// returned vector. \n
908 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the
909 /// returned vector. \n
910 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the
911 /// returned vector. \n
912 /// Bits [65:64]: \n
913 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the
914 /// returned vector. \n
915 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the
916 /// returned vector. \n
917 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the
918 /// returned vector. \n
919 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the
920 /// returned vector. \n
921 /// Bits [97:96]: \n
922 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the
923 /// returned vector. \n
924 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the
925 /// returned vector. \n
926 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the
927 /// returned vector. \n
928 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the
929 /// returned vector. \n
930 /// Bits [129:128]: \n
931 /// 00: Bits [159:128] of the source are copied to bits [159:128] of the
932 /// returned vector. \n
933 /// 01: Bits [191:160] of the source are copied to bits [159:128] of the
934 /// returned vector. \n
935 /// 10: Bits [223:192] of the source are copied to bits [159:128] of the
936 /// returned vector. \n
937 /// 11: Bits [255:224] of the source are copied to bits [159:128] of the
938 /// returned vector. \n
939 /// Bits [161:160]: \n
940 /// 00: Bits [159:128] of the source are copied to bits [191:160] of the
941 /// returned vector. \n
942 /// 01: Bits [191:160] of the source are copied to bits [191:160] of the
943 /// returned vector. \n
944 /// 10: Bits [223:192] of the source are copied to bits [191:160] of the
945 /// returned vector. \n
946 /// 11: Bits [255:224] of the source are copied to bits [191:160] of the
947 /// returned vector. \n
948 /// Bits [193:192]: \n
949 /// 00: Bits [159:128] of the source are copied to bits [223:192] of the
950 /// returned vector. \n
951 /// 01: Bits [191:160] of the source are copied to bits [223:192] of the
952 /// returned vector. \n
953 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the
954 /// returned vector. \n
955 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the
956 /// returned vector. \n
957 /// Bits [225:224]: \n
958 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the
959 /// returned vector. \n
960 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the
961 /// returned vector. \n
962 /// 10: Bits [223:192] of the source are copied to bits [255:224] of the
963 /// returned vector. \n
964 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the
965 /// returned vector.
966 /// \returns A 256-bit vector of [8 x float] containing the copied values.
967 static __inline __m256 __DEFAULT_FN_ATTRS
968 _mm256_permutevar_ps(__m256 __a, __m256i __c)
969 {
970  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
971 }
972 
973 /// Copies the values in a 128-bit vector of [2 x double] as specified
974 /// by the immediate integer operand.
975 ///
976 /// \headerfile <x86intrin.h>
977 ///
978 /// \code
979 /// __m128d _mm_permute_pd(__m128d A, const int C);
980 /// \endcode
981 ///
982 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
983 ///
984 /// \param A
985 /// A 128-bit vector of [2 x double].
986 /// \param C
987 /// An immediate integer operand specifying how the values are to be
988 /// copied. \n
989 /// Bit [0]: \n
990 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
991 /// vector. \n
992 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the
993 /// returned vector. \n
994 /// Bit [1]: \n
995 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the
996 /// returned vector. \n
997 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the
998 /// returned vector.
999 /// \returns A 128-bit vector of [2 x double] containing the copied values.
1000 #define _mm_permute_pd(A, C) \
1001  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
1002 
1003 /// Copies the values in a 256-bit vector of [4 x double] as specified by
1004 /// the immediate integer operand.
1005 ///
1006 /// \headerfile <x86intrin.h>
1007 ///
1008 /// \code
1009 /// __m256d _mm256_permute_pd(__m256d A, const int C);
1010 /// \endcode
1011 ///
1012 /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1013 ///
1014 /// \param A
1015 /// A 256-bit vector of [4 x double].
1016 /// \param C
1017 /// An immediate integer operand specifying how the values are to be
1018 /// copied. \n
1019 /// Bit [0]: \n
1020 /// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1021 /// vector. \n
1022 /// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1023 /// returned vector. \n
1024 /// Bit [1]: \n
1025 /// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1026 /// returned vector. \n
1027 /// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1028 /// returned vector. \n
1029 /// Bit [2]: \n
1030 /// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1031 /// returned vector. \n
1032 /// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1033 /// returned vector. \n
1034 /// Bit [3]: \n
1035 /// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1036 /// returned vector. \n
1037 /// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1038 /// returned vector.
1039 /// \returns A 256-bit vector of [4 x double] containing the copied values.
1040 #define _mm256_permute_pd(A, C) \
1041  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1042 
1043 /// Copies the values in a 128-bit vector of [4 x float] as specified by
1044 /// the immediate integer operand.
1045 ///
1046 /// \headerfile <x86intrin.h>
1047 ///
1048 /// \code
1049 /// __m128 _mm_permute_ps(__m128 A, const int C);
1050 /// \endcode
1051 ///
1052 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1053 ///
1054 /// \param A
1055 /// A 128-bit vector of [4 x float].
1056 /// \param C
1057 /// An immediate integer operand specifying how the values are to be
1058 /// copied. \n
1059 /// Bits [1:0]: \n
1060 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1061 /// returned vector. \n
1062 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1063 /// returned vector. \n
1064 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1065 /// returned vector. \n
1066 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1067 /// returned vector. \n
1068 /// Bits [3:2]: \n
1069 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1070 /// returned vector. \n
1071 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1072 /// returned vector. \n
1073 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1074 /// returned vector. \n
1075 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1076 /// returned vector. \n
1077 /// Bits [5:4]: \n
1078 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1079 /// returned vector. \n
1080 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1081 /// returned vector. \n
1082 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1083 /// returned vector. \n
1084 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1085 /// returned vector. \n
1086 /// Bits [7:6]: \n
1087 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1088 /// returned vector. \n
1089 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1090 /// returned vector. \n
1091 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1092 /// returned vector. \n
1093 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1094 /// returned vector.
1095 /// \returns A 128-bit vector of [4 x float] containing the copied values.
1096 #define _mm_permute_ps(A, C) \
1097  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1098 
1099 /// Copies the values in a 256-bit vector of [8 x float] as specified by
1100 /// the immediate integer operand.
1101 ///
1102 /// \headerfile <x86intrin.h>
1103 ///
1104 /// \code
1105 /// __m256 _mm256_permute_ps(__m256 A, const int C);
1106 /// \endcode
1107 ///
1108 /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1109 ///
1110 /// \param A
1111 /// A 256-bit vector of [8 x float].
1112 /// \param C
1113 /// An immediate integer operand specifying how the values are to be
1114 /// copied. \n
1115 /// Bits [1:0]: \n
1116 /// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1117 /// returned vector. \n
1118 /// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1119 /// returned vector. \n
1120 /// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1121 /// returned vector. \n
1122 /// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1123 /// returned vector. \n
1124 /// Bits [3:2]: \n
1125 /// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1126 /// returned vector. \n
1127 /// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1128 /// returned vector. \n
1129 /// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1130 /// returned vector. \n
1131 /// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1132 /// returned vector. \n
1133 /// Bits [5:4]: \n
1134 /// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1135 /// returned vector. \n
1136 /// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1137 /// returned vector. \n
1138 /// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1139 /// returned vector. \n
1140 /// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1141 /// returned vector. \n
1142 /// Bits [7:6]: \n
1143 /// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1144 /// returned vector. \n
1145 /// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1146 /// returned vector. \n
1147 /// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1148 /// returned vector. \n
1149 /// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1150 /// returned vector. \n
1151 /// Bits [1:0]: \n
1152 /// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1153 /// returned vector. \n
1154 /// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1155 /// returned vector. \n
1156 /// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1157 /// returned vector. \n
1158 /// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1159 /// returned vector. \n
1160 /// Bits [3:2]: \n
1161 /// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1162 /// returned vector. \n
1163 /// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1164 /// returned vector. \n
1165 /// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1166 /// returned vector. \n
1167 /// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1168 /// returned vector. \n
1169 /// Bits [5:4]: \n
1170 /// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1171 /// returned vector. \n
1172 /// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1173 /// returned vector. \n
1174 /// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1175 /// returned vector. \n
1176 /// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1177 /// returned vector. \n
1178 /// Bits [7:6]: \n
1179 /// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1180 /// returned vector. \n
1181 /// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1182 /// returned vector. \n
1183 /// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1184 /// returned vector. \n
1185 /// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1186 /// returned vector.
1187 /// \returns A 256-bit vector of [8 x float] containing the copied values.
1188 #define _mm256_permute_ps(A, C) \
1189  (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
1190 
1191 /// Permutes 128-bit data values stored in two 256-bit vectors of
1192 /// [4 x double], as specified by the immediate integer operand.
1193 ///
1194 /// \headerfile <x86intrin.h>
1195 ///
1196 /// \code
1197 /// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1198 /// \endcode
1199 ///
1200 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1201 ///
1202 /// \param V1
1203 /// A 256-bit vector of [4 x double].
1204 /// \param V2
1205 /// A 256-bit vector of [4 x double].
1206 /// \param M
1207 /// An immediate integer operand specifying how the values are to be
1208 /// permuted. \n
1209 /// Bits [1:0]: \n
1210 /// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1211 /// destination. \n
1212 /// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1213 /// destination. \n
1214 /// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1215 /// destination. \n
1216 /// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1217 /// destination. \n
1218 /// Bits [5:4]: \n
1219 /// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1220 /// destination. \n
1221 /// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1222 /// destination. \n
1223 /// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1224 /// destination. \n
1225 /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1226 /// destination.
1227 /// \returns A 256-bit vector of [4 x double] containing the copied values.
1228 #define _mm256_permute2f128_pd(V1, V2, M) \
1229  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1230  (__v4df)(__m256d)(V2), (int)(M)))
1231 
1232 /// Permutes 128-bit data values stored in two 256-bit vectors of
1233 /// [8 x float], as specified by the immediate integer operand.
1234 ///
1235 /// \headerfile <x86intrin.h>
1236 ///
1237 /// \code
1238 /// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1239 /// \endcode
1240 ///
1241 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1242 ///
1243 /// \param V1
1244 /// A 256-bit vector of [8 x float].
1245 /// \param V2
1246 /// A 256-bit vector of [8 x float].
1247 /// \param M
1248 /// An immediate integer operand specifying how the values are to be
1249 /// permuted. \n
1250 /// Bits [1:0]: \n
1251 /// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1252 /// destination. \n
1253 /// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1254 /// destination. \n
1255 /// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1256 /// destination. \n
1257 /// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1258 /// destination. \n
1259 /// Bits [5:4]: \n
1260 /// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1261 /// destination. \n
1262 /// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1263 /// destination. \n
1264 /// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1265 /// destination. \n
1266 /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1267 /// destination.
1268 /// \returns A 256-bit vector of [8 x float] containing the copied values.
1269 #define _mm256_permute2f128_ps(V1, V2, M) \
1270  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1271  (__v8sf)(__m256)(V2), (int)(M)))
1272 
1273 /// Permutes 128-bit data values stored in two 256-bit integer vectors,
1274 /// as specified by the immediate integer operand.
1275 ///
1276 /// \headerfile <x86intrin.h>
1277 ///
1278 /// \code
1279 /// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1280 /// \endcode
1281 ///
1282 /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1283 ///
1284 /// \param V1
1285 /// A 256-bit integer vector.
1286 /// \param V2
1287 /// A 256-bit integer vector.
1288 /// \param M
1289 /// An immediate integer operand specifying how values are to be copied. \n
1290 /// Bits [1:0]: \n
1291 /// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1292 /// destination. \n
1293 /// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1294 /// destination. \n
1295 /// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1296 /// destination. \n
1297 /// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1298 /// destination. \n
1299 /// Bits [5:4]: \n
1300 /// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1301 /// destination. \n
1302 /// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1303 /// destination. \n
1304 /// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1305 /// destination. \n
1306 /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1307 /// destination.
1308 /// \returns A 256-bit integer vector containing the copied values.
1309 #define _mm256_permute2f128_si256(V1, V2, M) \
1310  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1311  (__v8si)(__m256i)(V2), (int)(M)))
1312 
1313 /* Vector Blend */
1314 /// Merges 64-bit double-precision data values stored in either of the
1315 /// two 256-bit vectors of [4 x double], as specified by the immediate
1316 /// integer operand.
1317 ///
1318 /// \headerfile <x86intrin.h>
1319 ///
1320 /// \code
1321 /// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1322 /// \endcode
1323 ///
1324 /// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1325 ///
1326 /// \param V1
1327 /// A 256-bit vector of [4 x double].
1328 /// \param V2
1329 /// A 256-bit vector of [4 x double].
1330 /// \param M
1331 /// An immediate integer operand, with mask bits [3:0] specifying how the
1332 /// values are to be copied. The position of the mask bit corresponds to the
1333 /// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1334 /// element in operand \a V1 is copied to the same position in the
1335 /// destination. When a mask bit is 1, the corresponding 64-bit element in
1336 /// operand \a V2 is copied to the same position in the destination.
1337 /// \returns A 256-bit vector of [4 x double] containing the copied values.
1338 #define _mm256_blend_pd(V1, V2, M) \
1339  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
1340  (__v4df)(__m256d)(V2), (int)(M)))
1341 
1342 /// Merges 32-bit single-precision data values stored in either of the
1343 /// two 256-bit vectors of [8 x float], as specified by the immediate
1344 /// integer operand.
1345 ///
1346 /// \headerfile <x86intrin.h>
1347 ///
1348 /// \code
1349 /// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1350 /// \endcode
1351 ///
1352 /// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1353 ///
1354 /// \param V1
1355 /// A 256-bit vector of [8 x float].
1356 /// \param V2
1357 /// A 256-bit vector of [8 x float].
1358 /// \param M
1359 /// An immediate integer operand, with mask bits [7:0] specifying how the
1360 /// values are to be copied. The position of the mask bit corresponds to the
1361 /// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1362 /// element in operand \a V1 is copied to the same position in the
1363 /// destination. When a mask bit is 1, the corresponding 32-bit element in
1364 /// operand \a V2 is copied to the same position in the destination.
1365 /// \returns A 256-bit vector of [8 x float] containing the copied values.
1366 #define _mm256_blend_ps(V1, V2, M) \
1367  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
1368  (__v8sf)(__m256)(V2), (int)(M)))
1369 
1370 /// Merges 64-bit double-precision data values stored in either of the
1371 /// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1372 /// operand.
1373 ///
1374 /// \headerfile <x86intrin.h>
1375 ///
1376 /// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1377 ///
1378 /// \param __a
1379 /// A 256-bit vector of [4 x double]; supplies elements whose mask bit is 0.
1380 /// \param __b
1381 /// A 256-bit vector of [4 x double]; supplies elements whose mask bit is 1.
1382 /// \param __c
1383 /// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1384 /// how the values are to be copied. The position of the mask bit corresponds
1385 /// to the most significant bit of a copied value. When a mask bit is 0, the
1386 /// corresponding 64-bit element in operand \a __a is copied to the same
1387 /// position in the destination. When a mask bit is 1, the corresponding
1388 /// 64-bit element in operand \a __b is copied to the same position in the
1389 /// destination.
1390 /// \returns A 256-bit vector of [4 x double] containing the copied values.
1391 static __inline __m256d __DEFAULT_FN_ATTRS
1392 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1393 {
1394  return (__m256d)__builtin_ia32_blendvpd256(
1395  (__v4df)__a, (__v4df)__b, (__v4df)__c);
1396 }
1397 
1398 /// Merges 32-bit single-precision data values stored in either of the
1399 /// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1400 /// operand.
1401 ///
1402 /// \headerfile <x86intrin.h>
1403 ///
1404 /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1405 ///
1406 /// \param __a
1407 /// A 256-bit vector of [8 x float]; supplies elements whose mask bit is 0.
1408 /// \param __b
1409 /// A 256-bit vector of [8 x float]; supplies elements whose mask bit is 1.
1410 /// \param __c
1411 /// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1412 /// and 31 specifying how the values are to be copied. The position of the
1413 /// mask bit corresponds to the most significant bit of a copied value. When
1414 /// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1415 /// copied to the same position in the destination. When a mask bit is 1, the
1416 /// corresponding 32-bit element in operand \a __b is copied to the same
1417 /// position in the destination.
1418 /// \returns A 256-bit vector of [8 x float] containing the copied values.
1419 static __inline __m256 __DEFAULT_FN_ATTRS
1420 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1421 {
1422  return (__m256)__builtin_ia32_blendvps256(
1423  (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1424 }
1425 
1426 /* Vector Dot Product */
1427 /// Computes two dot products in parallel, using the lower and upper
1428 /// halves of two [8 x float] vectors as input to the two computations, and
1429 /// returning the two dot products in the lower and upper halves of the
1430 /// [8 x float] result.
1431 ///
1432 /// The immediate integer operand controls which input elements will
1433 /// contribute to the dot product, and where the final results are returned.
1434 /// In general, for each dot product, the four corresponding elements of the
1435 /// input vectors are multiplied; the first two and second two products are
1436 /// summed, then the two sums are added to form the final result.
1437 ///
1438 /// \headerfile <x86intrin.h>
1439 ///
1440 /// \code
1441 /// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1442 /// \endcode
1443 ///
1444 /// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1445 ///
1446 /// \param V1
1447 /// A vector of [8 x float] values, treated as two [4 x float] vectors.
1448 /// \param V2
1449 /// A vector of [8 x float] values, treated as two [4 x float] vectors.
1450 /// \param M
1451 /// An immediate integer argument. Bits [7:4] determine which elements of
1452 /// the input vectors are used, with bit [4] corresponding to the lowest
1453 /// element and bit [7] corresponding to the highest element of each [4 x
1454 /// float] subvector. If a bit is set, the corresponding elements from the
1455 /// two input vectors are used as an input for dot product; otherwise that
1456 /// input is treated as zero. Bits [3:0] determine which elements of the
1457 /// result will receive a copy of the final dot product, with bit [0]
1458 /// corresponding to the lowest element and bit [3] corresponding to the
1459 /// highest element of each [4 x float] subvector. If a bit is set, the dot
1460 /// product is returned in the corresponding element; otherwise that element
1461 /// is set to zero. The bitmask is applied in the same way to each of the
1462 /// two parallel dot product computations.
1463 /// \returns A 256-bit vector of [8 x float] containing the two dot products.
1464 #define _mm256_dp_ps(V1, V2, M) \
1465  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
1466  (__v8sf)(__m256)(V2), (M)))
1467 
1468 /* Vector shuffle */
1469 /// Selects 8 float values from the 256-bit operands of [8 x float], as
1470 /// specified by the immediate value operand.
1471 ///
1472 /// The four selected elements in each operand are copied to the destination
1473 /// according to the bits specified in the immediate operand. The selected
1474 /// elements from the first 256-bit operand are copied to bits [63:0] and
1475 /// bits [191:128] of the destination, and the selected elements from the
1476 /// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1477 /// the destination. For example, if bits [7:0] of the immediate operand
1478 /// contain a value of 0xFF, the 256-bit destination vector would contain the
1479 /// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1480 ///
1481 /// \headerfile <x86intrin.h>
1482 ///
1483 /// \code
1484 /// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1485 /// \endcode
1486 ///
1487 /// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1488 ///
1489 /// \param a
1490 /// A 256-bit vector of [8 x float]. The four selected elements in this
1491 /// operand are copied to bits [63:0] and bits [191:128] in the destination,
1492 /// according to the bits specified in the immediate operand.
1493 /// \param b
1494 /// A 256-bit vector of [8 x float]. The four selected elements in this
1495 /// operand are copied to bits [127:64] and bits [255:192] in the
1496 /// destination, according to the bits specified in the immediate operand.
1497 /// \param mask
1498 /// An immediate value containing an 8-bit value specifying which elements to
1499 /// copy from \a a and \a b. \n
1500 /// Bits [3:0] specify the values copied from operand \a a. \n
1501 /// Bits [7:4] specify the values copied from operand \a b. \n
1502 /// The destinations within the 256-bit destination are assigned values as
1503 /// follows, according to the bit value assignments described below: \n
1504 /// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1505 /// destination. \n
1506 /// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1507 /// destination. \n
1508 /// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1509 /// destination. \n
1510 /// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1511 /// the destination. \n
1512 /// Bit value assignments: \n
1513 /// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1514 /// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1515 /// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1516 /// 11: Bits [127:96] and [255:224] are copied from the selected operand.
1517 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
1518 #define _mm256_shuffle_ps(a, b, mask) \
1519  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1520  (__v8sf)(__m256)(b), (int)(mask)))
1521 
1522 /// Selects four double-precision values from the 256-bit operands of
1523 /// [4 x double], as specified by the immediate value operand.
1524 ///
1525 /// The selected elements from the first 256-bit operand are copied to bits
1526 /// [63:0] and bits [191:128] in the destination, and the selected elements
1527 /// from the second 256-bit operand are copied to bits [127:64] and bits
1528 /// [255:192] in the destination. For example, if bits [3:0] of the immediate
1529 /// operand contain a value of 0xF, the 256-bit destination vector would
1530 /// contain the following values: b[3], a[3], b[1], a[1].
1531 ///
1532 /// \headerfile <x86intrin.h>
1533 ///
1534 /// \code
1535 /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
1536 /// \endcode
1537 ///
1538 /// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
1539 ///
1540 /// \param a
1541 /// A 256-bit vector of [4 x double].
1542 /// \param b
1543 /// A 256-bit vector of [4 x double].
1544 /// \param mask
1545 /// An immediate value whose bits [3:0] specify which elements to copy
1546 /// from \a a and \a b: \n
1547 /// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
1548 /// destination. \n
1549 /// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
1550 /// destination. \n
1551 /// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
1552 /// destination. \n
1553 /// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
1554 /// destination. \n
1555 /// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
1556 /// destination. \n
1557 /// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
1558 /// destination. \n
1559 /// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
1560 /// destination. \n
1561 /// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
1562 /// destination.
1563 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
1564 #define _mm256_shuffle_pd(a, b, mask) \
1565  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
1566  (__v4df)(__m256d)(b), (int)(mask)))
1567 
/* Compare */
/* Predicate encodings accepted as the immediate operand of the _mm*_cmp_*
   macros; the value occupies bits [4:0] of that operand. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1601 
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1661 
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1721 
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1781 
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1841 
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1900 
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1959 
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1977 
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
1996 
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2015 
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2035 
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2057 
2058 
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2080 
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2102 
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2126 
2127 /* Conversion */
2128 /// Converts a vector of [4 x i32] into a vector of [4 x double].
2129 ///
2130 /// \headerfile <x86intrin.h>
2131 ///
2132 /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2133 ///
2134 /// \param __a
2135 /// A 128-bit integer vector of [4 x i32].
2136 /// \returns A 256-bit vector of [4 x double] containing the converted values.
2137 static __inline __m256d __DEFAULT_FN_ATTRS
2139 {
2140  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2141 }
2142 
2143 /// Converts a vector of [8 x i32] into a vector of [8 x float].
2144 ///
2145 /// \headerfile <x86intrin.h>
2146 ///
2147 /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2148 ///
2149 /// \param __a
2150 /// A 256-bit integer vector.
2151 /// \returns A 256-bit vector of [8 x float] containing the converted values.
2152 static __inline __m256 __DEFAULT_FN_ATTRS
2154 {
2155  return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2156 }
2157 
2158 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2159 /// [4 x float].
2160 ///
2161 /// \headerfile <x86intrin.h>
2162 ///
2163 /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2164 ///
2165 /// \param __a
2166 /// A 256-bit vector of [4 x double].
2167 /// \returns A 128-bit vector of [4 x float] containing the converted values.
2168 static __inline __m128 __DEFAULT_FN_ATTRS
2169 _mm256_cvtpd_ps(__m256d __a)
2170 {
2171  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2172 }
2173 
2174 /// Converts a vector of [8 x float] into a vector of [8 x i32].
2175 ///
2176 /// \headerfile <x86intrin.h>
2177 ///
2178 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2179 ///
2180 /// \param __a
2181 /// A 256-bit vector of [8 x float].
2182 /// \returns A 256-bit integer vector containing the converted values.
2183 static __inline __m256i __DEFAULT_FN_ATTRS
2185 {
2186  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2187 }
2188 
2189 /// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2190 /// x double].
2191 ///
2192 /// \headerfile <x86intrin.h>
2193 ///
2194 /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2195 ///
2196 /// \param __a
2197 /// A 128-bit vector of [4 x float].
2198 /// \returns A 256-bit vector of [4 x double] containing the converted values.
2199 static __inline __m256d __DEFAULT_FN_ATTRS
2200 _mm256_cvtps_pd(__m128 __a)
2201 {
2202  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2203 }
2204 
2205 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2206 /// x i32], truncating the result by rounding towards zero when it is
2207 /// inexact.
2208 ///
2209 /// \headerfile <x86intrin.h>
2210 ///
2211 /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2212 ///
2213 /// \param __a
2214 /// A 256-bit vector of [4 x double].
2215 /// \returns A 128-bit integer vector containing the converted values.
2216 static __inline __m128i __DEFAULT_FN_ATTRS
2218 {
2219  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2220 }
2221 
2222 /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2223 /// x i32]. When a conversion is inexact, the value returned is rounded
2224 /// according to the rounding control bits in the MXCSR register.
2225 ///
2226 /// \headerfile <x86intrin.h>
2227 ///
2228 /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2229 ///
2230 /// \param __a
2231 /// A 256-bit vector of [4 x double].
2232 /// \returns A 128-bit integer vector containing the converted values.
2233 static __inline __m128i __DEFAULT_FN_ATTRS
2235 {
2236  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2237 }
2238 
2239 /// Converts a vector of [8 x float] into a vector of [8 x i32],
2240 /// truncating the result by rounding towards zero when it is inexact.
2241 ///
2242 /// \headerfile <x86intrin.h>
2243 ///
2244 /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2245 ///
2246 /// \param __a
2247 /// A 256-bit vector of [8 x float].
2248 /// \returns A 256-bit integer vector containing the converted values.
2249 static __inline __m256i __DEFAULT_FN_ATTRS
2251 {
2252  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2253 }
2254 
2255 /// Returns the first element of the input vector of [4 x double].
2256 ///
2257 /// \headerfile <avxintrin.h>
2258 ///
2259 /// This intrinsic is a utility function and does not correspond to a specific
2260 /// instruction.
2261 ///
2262 /// \param __a
2263 /// A 256-bit vector of [4 x double].
2264 /// \returns A 64 bit double containing the first element of the input vector.
2265 static __inline double __DEFAULT_FN_ATTRS
2266 _mm256_cvtsd_f64(__m256d __a)
2267 {
2268  return __a[0];
2269 }
2270 
2271 /// Returns the first element of the input vector of [8 x i32].
2272 ///
/// \headerfile <x86intrin.h>
2274 ///
2275 /// This intrinsic is a utility function and does not correspond to a specific
2276 /// instruction.
2277 ///
2278 /// \param __a
2279 /// A 256-bit vector of [8 x i32].
2280 /// \returns A 32 bit integer containing the first element of the input vector.
2281 static __inline int __DEFAULT_FN_ATTRS
2283 {
2284  __v8si __b = (__v8si)__a;
2285  return __b[0];
2286 }
2287 
2288 /// Returns the first element of the input vector of [8 x float].
2289 ///
/// \headerfile <x86intrin.h>
2291 ///
2292 /// This intrinsic is a utility function and does not correspond to a specific
2293 /// instruction.
2294 ///
2295 /// \param __a
2296 /// A 256-bit vector of [8 x float].
2297 /// \returns A 32 bit float containing the first element of the input vector.
2298 static __inline float __DEFAULT_FN_ATTRS
2299 _mm256_cvtss_f32(__m256 __a)
2300 {
2301  return __a[0];
2302 }
2303 
2304 /* Vector replicate */
2305 /// Moves and duplicates odd-indexed values from a 256-bit vector of
2306 /// [8 x float] to float values in a 256-bit vector of [8 x float].
2307 ///
2308 /// \headerfile <x86intrin.h>
2309 ///
2310 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2311 ///
2312 /// \param __a
2313 /// A 256-bit vector of [8 x float]. \n
2314 /// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2315 /// the return value. \n
2316 /// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2317 /// the return value. \n
2318 /// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2319 /// return value. \n
2320 /// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2321 /// return value.
2322 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2323 /// values.
2324 static __inline __m256 __DEFAULT_FN_ATTRS
2326 {
2327  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2328 }
2329 
2330 /// Moves and duplicates even-indexed values from a 256-bit vector of
2331 /// [8 x float] to float values in a 256-bit vector of [8 x float].
2332 ///
2333 /// \headerfile <x86intrin.h>
2334 ///
2335 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2336 ///
2337 /// \param __a
2338 /// A 256-bit vector of [8 x float]. \n
2339 /// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2340 /// the return value. \n
2341 /// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2342 /// the return value. \n
2343 /// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2344 /// return value. \n
2345 /// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2346 /// return value.
2347 /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2348 /// values.
2349 static __inline __m256 __DEFAULT_FN_ATTRS
2351 {
2352  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2353 }
2354 
2355 /// Moves and duplicates double-precision floating point values from a
2356 /// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2357 /// vector of [4 x double].
2358 ///
2359 /// \headerfile <x86intrin.h>
2360 ///
2361 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2362 ///
2363 /// \param __a
2364 /// A 256-bit vector of [4 x double]. \n
2365 /// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2366 /// return value. \n
2367 /// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2368 /// the return value.
2369 /// \returns A 256-bit vector of [4 x double] containing the moved and
2370 /// duplicated values.
2371 static __inline __m256d __DEFAULT_FN_ATTRS
2372 _mm256_movedup_pd(__m256d __a)
2373 {
2374  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2375 }
2376 
2377 /* Unpack and Interleave */
2378 /// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2379 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2380 ///
2381 /// \headerfile <x86intrin.h>
2382 ///
2383 /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2384 ///
2385 /// \param __a
2386 /// A 256-bit floating-point vector of [4 x double]. \n
2387 /// Bits [127:64] are written to bits [63:0] of the return value. \n
2388 /// Bits [255:192] are written to bits [191:128] of the return value. \n
2389 /// \param __b
2390 /// A 256-bit floating-point vector of [4 x double]. \n
2391 /// Bits [127:64] are written to bits [127:64] of the return value. \n
2392 /// Bits [255:192] are written to bits [255:192] of the return value. \n
2393 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2394 static __inline __m256d __DEFAULT_FN_ATTRS
2395 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
2396 {
2397  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2398 }
2399 
2400 /// Unpacks the even-indexed vector elements from two 256-bit vectors of
2401 /// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2402 ///
2403 /// \headerfile <x86intrin.h>
2404 ///
2405 /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2406 ///
2407 /// \param __a
2408 /// A 256-bit floating-point vector of [4 x double]. \n
2409 /// Bits [63:0] are written to bits [63:0] of the return value. \n
2410 /// Bits [191:128] are written to bits [191:128] of the return value.
2411 /// \param __b
2412 /// A 256-bit floating-point vector of [4 x double]. \n
2413 /// Bits [63:0] are written to bits [127:64] of the return value. \n
2414 /// Bits [191:128] are written to bits [255:192] of the return value. \n
2415 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2416 static __inline __m256d __DEFAULT_FN_ATTRS
2417 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
2418 {
2419  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2420 }
2421 
2422 /// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2423 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2424 /// vector of [8 x float].
2425 ///
2426 /// \headerfile <x86intrin.h>
2427 ///
2428 /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2429 ///
2430 /// \param __a
2431 /// A 256-bit vector of [8 x float]. \n
2432 /// Bits [95:64] are written to bits [31:0] of the return value. \n
2433 /// Bits [127:96] are written to bits [95:64] of the return value. \n
2434 /// Bits [223:192] are written to bits [159:128] of the return value. \n
2435 /// Bits [255:224] are written to bits [223:192] of the return value.
2436 /// \param __b
2437 /// A 256-bit vector of [8 x float]. \n
2438 /// Bits [95:64] are written to bits [63:32] of the return value. \n
2439 /// Bits [127:96] are written to bits [127:96] of the return value. \n
2440 /// Bits [223:192] are written to bits [191:160] of the return value. \n
2441 /// Bits [255:224] are written to bits [255:224] of the return value.
2442 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2443 static __inline __m256 __DEFAULT_FN_ATTRS
2444 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
2445 {
2446  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2447 }
2448 
2449 /// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2450 /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2451 /// vector of [8 x float].
2452 ///
2453 /// \headerfile <x86intrin.h>
2454 ///
2455 /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2456 ///
2457 /// \param __a
2458 /// A 256-bit vector of [8 x float]. \n
2459 /// Bits [31:0] are written to bits [31:0] of the return value. \n
2460 /// Bits [63:32] are written to bits [95:64] of the return value. \n
2461 /// Bits [159:128] are written to bits [159:128] of the return value. \n
2462 /// Bits [191:160] are written to bits [223:192] of the return value.
2463 /// \param __b
2464 /// A 256-bit vector of [8 x float]. \n
2465 /// Bits [31:0] are written to bits [63:32] of the return value. \n
2466 /// Bits [63:32] are written to bits [127:96] of the return value. \n
2467 /// Bits [159:128] are written to bits [191:160] of the return value. \n
2468 /// Bits [191:160] are written to bits [255:224] of the return value.
2469 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2470 static __inline __m256 __DEFAULT_FN_ATTRS
2471 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
2472 {
2473  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2474 }
2475 
2476 /* Bit Test */
2477 /// Given two 128-bit floating-point vectors of [2 x double], perform an
2478 /// element-by-element comparison of the double-precision element in the
2479 /// first source vector and the corresponding element in the second source
2480 /// vector.
2481 ///
2482 /// The EFLAGS register is updated as follows: \n
2483 /// If there is at least one pair of double-precision elements where the
2484 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2485 /// ZF flag is set to 1. \n
2486 /// If there is at least one pair of double-precision elements where the
2487 /// sign-bit of the first element is 0 and the sign-bit of the second element
2488 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2489 /// This intrinsic returns the value of the ZF flag.
2490 ///
2491 /// \headerfile <x86intrin.h>
2492 ///
2493 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2494 ///
2495 /// \param __a
2496 /// A 128-bit vector of [2 x double].
2497 /// \param __b
2498 /// A 128-bit vector of [2 x double].
2499 /// \returns the ZF flag in the EFLAGS register.
2500 static __inline int __DEFAULT_FN_ATTRS
2501 _mm_testz_pd(__m128d __a, __m128d __b)
2502 {
2503  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2504 }
2505 
2506 /// Given two 128-bit floating-point vectors of [2 x double], perform an
2507 /// element-by-element comparison of the double-precision element in the
2508 /// first source vector and the corresponding element in the second source
2509 /// vector.
2510 ///
2511 /// The EFLAGS register is updated as follows: \n
2512 /// If there is at least one pair of double-precision elements where the
2513 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2514 /// ZF flag is set to 1. \n
2515 /// If there is at least one pair of double-precision elements where the
2516 /// sign-bit of the first element is 0 and the sign-bit of the second element
2517 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2518 /// This intrinsic returns the value of the CF flag.
2519 ///
2520 /// \headerfile <x86intrin.h>
2521 ///
2522 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2523 ///
2524 /// \param __a
2525 /// A 128-bit vector of [2 x double].
2526 /// \param __b
2527 /// A 128-bit vector of [2 x double].
2528 /// \returns the CF flag in the EFLAGS register.
2529 static __inline int __DEFAULT_FN_ATTRS
2530 _mm_testc_pd(__m128d __a, __m128d __b)
2531 {
2532  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2533 }
2534 
2535 /// Given two 128-bit floating-point vectors of [2 x double], perform an
2536 /// element-by-element comparison of the double-precision element in the
2537 /// first source vector and the corresponding element in the second source
2538 /// vector.
2539 ///
2540 /// The EFLAGS register is updated as follows: \n
2541 /// If there is at least one pair of double-precision elements where the
2542 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2543 /// ZF flag is set to 1. \n
2544 /// If there is at least one pair of double-precision elements where the
2545 /// sign-bit of the first element is 0 and the sign-bit of the second element
2546 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2547 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2548 /// otherwise it returns 0.
2549 ///
2550 /// \headerfile <x86intrin.h>
2551 ///
2552 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2553 ///
2554 /// \param __a
2555 /// A 128-bit vector of [2 x double].
2556 /// \param __b
2557 /// A 128-bit vector of [2 x double].
2558 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2559 static __inline int __DEFAULT_FN_ATTRS
2560 _mm_testnzc_pd(__m128d __a, __m128d __b)
2561 {
2562  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2563 }
2564 
2565 /// Given two 128-bit floating-point vectors of [4 x float], perform an
2566 /// element-by-element comparison of the single-precision element in the
2567 /// first source vector and the corresponding element in the second source
2568 /// vector.
2569 ///
2570 /// The EFLAGS register is updated as follows: \n
2571 /// If there is at least one pair of single-precision elements where the
2572 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2573 /// ZF flag is set to 1. \n
2574 /// If there is at least one pair of single-precision elements where the
2575 /// sign-bit of the first element is 0 and the sign-bit of the second element
2576 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2577 /// This intrinsic returns the value of the ZF flag.
2578 ///
2579 /// \headerfile <x86intrin.h>
2580 ///
2581 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2582 ///
2583 /// \param __a
2584 /// A 128-bit vector of [4 x float].
2585 /// \param __b
2586 /// A 128-bit vector of [4 x float].
2587 /// \returns the ZF flag.
2588 static __inline int __DEFAULT_FN_ATTRS
2589 _mm_testz_ps(__m128 __a, __m128 __b)
2590 {
2591  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2592 }
2593 
2594 /// Given two 128-bit floating-point vectors of [4 x float], perform an
2595 /// element-by-element comparison of the single-precision element in the
2596 /// first source vector and the corresponding element in the second source
2597 /// vector.
2598 ///
2599 /// The EFLAGS register is updated as follows: \n
2600 /// If there is at least one pair of single-precision elements where the
2601 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2602 /// ZF flag is set to 1. \n
2603 /// If there is at least one pair of single-precision elements where the
2604 /// sign-bit of the first element is 0 and the sign-bit of the second element
2605 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2606 /// This intrinsic returns the value of the CF flag.
2607 ///
2608 /// \headerfile <x86intrin.h>
2609 ///
2610 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2611 ///
2612 /// \param __a
2613 /// A 128-bit vector of [4 x float].
2614 /// \param __b
2615 /// A 128-bit vector of [4 x float].
2616 /// \returns the CF flag.
2617 static __inline int __DEFAULT_FN_ATTRS
2618 _mm_testc_ps(__m128 __a, __m128 __b)
2619 {
2620  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2621 }
2622 
2623 /// Given two 128-bit floating-point vectors of [4 x float], perform an
2624 /// element-by-element comparison of the single-precision element in the
2625 /// first source vector and the corresponding element in the second source
2626 /// vector.
2627 ///
2628 /// The EFLAGS register is updated as follows: \n
2629 /// If there is at least one pair of single-precision elements where the
2630 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2631 /// ZF flag is set to 1. \n
2632 /// If there is at least one pair of single-precision elements where the
2633 /// sign-bit of the first element is 0 and the sign-bit of the second element
2634 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2635 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2636 /// otherwise it returns 0.
2637 ///
2638 /// \headerfile <x86intrin.h>
2639 ///
2640 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2641 ///
2642 /// \param __a
2643 /// A 128-bit vector of [4 x float].
2644 /// \param __b
2645 /// A 128-bit vector of [4 x float].
2646 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2647 static __inline int __DEFAULT_FN_ATTRS
2648 _mm_testnzc_ps(__m128 __a, __m128 __b)
2649 {
2650  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2651 }
2652 
2653 /// Given two 256-bit floating-point vectors of [4 x double], perform an
2654 /// element-by-element comparison of the double-precision elements in the
2655 /// first source vector and the corresponding elements in the second source
2656 /// vector.
2657 ///
2658 /// The EFLAGS register is updated as follows: \n
2659 /// If there is at least one pair of double-precision elements where the
2660 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2661 /// ZF flag is set to 1. \n
2662 /// If there is at least one pair of double-precision elements where the
2663 /// sign-bit of the first element is 0 and the sign-bit of the second element
2664 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2665 /// This intrinsic returns the value of the ZF flag.
2666 ///
2667 /// \headerfile <x86intrin.h>
2668 ///
2669 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2670 ///
2671 /// \param __a
2672 /// A 256-bit vector of [4 x double].
2673 /// \param __b
2674 /// A 256-bit vector of [4 x double].
2675 /// \returns the ZF flag.
2676 static __inline int __DEFAULT_FN_ATTRS
2677 _mm256_testz_pd(__m256d __a, __m256d __b)
2678 {
2679  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2680 }
2681 
2682 /// Given two 256-bit floating-point vectors of [4 x double], perform an
2683 /// element-by-element comparison of the double-precision elements in the
2684 /// first source vector and the corresponding elements in the second source
2685 /// vector.
2686 ///
2687 /// The EFLAGS register is updated as follows: \n
2688 /// If there is at least one pair of double-precision elements where the
2689 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2690 /// ZF flag is set to 1. \n
2691 /// If there is at least one pair of double-precision elements where the
2692 /// sign-bit of the first element is 0 and the sign-bit of the second element
2693 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2694 /// This intrinsic returns the value of the CF flag.
2695 ///
2696 /// \headerfile <x86intrin.h>
2697 ///
2698 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2699 ///
2700 /// \param __a
2701 /// A 256-bit vector of [4 x double].
2702 /// \param __b
2703 /// A 256-bit vector of [4 x double].
2704 /// \returns the CF flag.
2705 static __inline int __DEFAULT_FN_ATTRS
2706 _mm256_testc_pd(__m256d __a, __m256d __b)
2707 {
2708  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2709 }
2710 
2711 /// Given two 256-bit floating-point vectors of [4 x double], perform an
2712 /// element-by-element comparison of the double-precision elements in the
2713 /// first source vector and the corresponding elements in the second source
2714 /// vector.
2715 ///
2716 /// The EFLAGS register is updated as follows: \n
2717 /// If there is at least one pair of double-precision elements where the
2718 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2719 /// ZF flag is set to 1. \n
2720 /// If there is at least one pair of double-precision elements where the
2721 /// sign-bit of the first element is 0 and the sign-bit of the second element
2722 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2723 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2724 /// otherwise it returns 0.
2725 ///
2726 /// \headerfile <x86intrin.h>
2727 ///
2728 /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2729 ///
2730 /// \param __a
2731 /// A 256-bit vector of [4 x double].
2732 /// \param __b
2733 /// A 256-bit vector of [4 x double].
2734 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2735 static __inline int __DEFAULT_FN_ATTRS
2736 _mm256_testnzc_pd(__m256d __a, __m256d __b)
2737 {
2738  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2739 }
2740 
2741 /// Given two 256-bit floating-point vectors of [8 x float], perform an
2742 /// element-by-element comparison of the single-precision element in the
2743 /// first source vector and the corresponding element in the second source
2744 /// vector.
2745 ///
2746 /// The EFLAGS register is updated as follows: \n
2747 /// If there is at least one pair of single-precision elements where the
2748 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2749 /// ZF flag is set to 1. \n
2750 /// If there is at least one pair of single-precision elements where the
2751 /// sign-bit of the first element is 0 and the sign-bit of the second element
2752 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2753 /// This intrinsic returns the value of the ZF flag.
2754 ///
2755 /// \headerfile <x86intrin.h>
2756 ///
2757 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2758 ///
2759 /// \param __a
2760 /// A 256-bit vector of [8 x float].
2761 /// \param __b
2762 /// A 256-bit vector of [8 x float].
2763 /// \returns the ZF flag.
2764 static __inline int __DEFAULT_FN_ATTRS
2765 _mm256_testz_ps(__m256 __a, __m256 __b)
2766 {
2767  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2768 }
2769 
2770 /// Given two 256-bit floating-point vectors of [8 x float], perform an
2771 /// element-by-element comparison of the single-precision element in the
2772 /// first source vector and the corresponding element in the second source
2773 /// vector.
2774 ///
2775 /// The EFLAGS register is updated as follows: \n
2776 /// If there is at least one pair of single-precision elements where the
2777 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2778 /// ZF flag is set to 1. \n
2779 /// If there is at least one pair of single-precision elements where the
2780 /// sign-bit of the first element is 0 and the sign-bit of the second element
2781 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2782 /// This intrinsic returns the value of the CF flag.
2783 ///
2784 /// \headerfile <x86intrin.h>
2785 ///
2786 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2787 ///
2788 /// \param __a
2789 /// A 256-bit vector of [8 x float].
2790 /// \param __b
2791 /// A 256-bit vector of [8 x float].
2792 /// \returns the CF flag.
2793 static __inline int __DEFAULT_FN_ATTRS
2794 _mm256_testc_ps(__m256 __a, __m256 __b)
2795 {
2796  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2797 }
2798 
2799 /// Given two 256-bit floating-point vectors of [8 x float], perform an
2800 /// element-by-element comparison of the single-precision elements in the
2801 /// first source vector and the corresponding elements in the second source
2802 /// vector.
2803 ///
2804 /// The EFLAGS register is updated as follows: \n
2805 /// If there is at least one pair of single-precision elements where the
2806 /// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2807 /// ZF flag is set to 1. \n
2808 /// If there is at least one pair of single-precision elements where the
2809 /// sign-bit of the first element is 0 and the sign-bit of the second element
2810 /// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2811 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2812 /// otherwise it returns 0.
2813 ///
2814 /// \headerfile <x86intrin.h>
2815 ///
2816 /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2817 ///
2818 /// \param __a
2819 /// A 256-bit vector of [8 x float].
2820 /// \param __b
2821 /// A 256-bit vector of [8 x float].
2822 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2823 static __inline int __DEFAULT_FN_ATTRS
2824 _mm256_testnzc_ps(__m256 __a, __m256 __b)
2825 {
2826  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2827 }
2828 
2829 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2830 /// of the two source vectors.
2831 ///
2832 /// The EFLAGS register is updated as follows: \n
2833 /// If there is at least one pair of bits where both bits are 1, the ZF flag
2834 /// is set to 0. Otherwise the ZF flag is set to 1. \n
2835 /// If there is at least one pair of bits where the bit from the first source
2836 /// vector is 0 and the bit from the second source vector is 1, the CF flag
2837 /// is set to 0. Otherwise the CF flag is set to 1. \n
2838 /// This intrinsic returns the value of the ZF flag.
2839 ///
2840 /// \headerfile <x86intrin.h>
2841 ///
2842 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2843 ///
2844 /// \param __a
2845 /// A 256-bit integer vector.
2846 /// \param __b
2847 /// A 256-bit integer vector.
2848 /// \returns the ZF flag.
2849 static __inline int __DEFAULT_FN_ATTRS
2850 _mm256_testz_si256(__m256i __a, __m256i __b)
2851 {
2852  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2853 }
2854 
2855 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2856 /// of the two source vectors.
2857 ///
2858 /// The EFLAGS register is updated as follows: \n
2859 /// If there is at least one pair of bits where both bits are 1, the ZF flag
2860 /// is set to 0. Otherwise the ZF flag is set to 1. \n
2861 /// If there is at least one pair of bits where the bit from the first source
2862 /// vector is 0 and the bit from the second source vector is 1, the CF flag
2863 /// is set to 0. Otherwise the CF flag is set to 1. \n
2864 /// This intrinsic returns the value of the CF flag.
2865 ///
2866 /// \headerfile <x86intrin.h>
2867 ///
2868 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2869 ///
2870 /// \param __a
2871 /// A 256-bit integer vector.
2872 /// \param __b
2873 /// A 256-bit integer vector.
2874 /// \returns the CF flag.
2875 static __inline int __DEFAULT_FN_ATTRS
2876 _mm256_testc_si256(__m256i __a, __m256i __b)
2877 {
2878  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2879 }
2880 
2881 /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2882 /// of the two source vectors.
2883 ///
2884 /// The EFLAGS register is updated as follows: \n
2885 /// If there is at least one pair of bits where both bits are 1, the ZF flag
2886 /// is set to 0. Otherwise the ZF flag is set to 1. \n
2887 /// If there is at least one pair of bits where the bit from the first source
2888 /// vector is 0 and the bit from the second source vector is 1, the CF flag
2889 /// is set to 0. Otherwise the CF flag is set to 1. \n
2890 /// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2891 /// otherwise it returns 0.
2892 ///
2893 /// \headerfile <x86intrin.h>
2894 ///
2895 /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2896 ///
2897 /// \param __a
2898 /// A 256-bit integer vector.
2899 /// \param __b
2900 /// A 256-bit integer vector.
2901 /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2902 static __inline int __DEFAULT_FN_ATTRS
2903 _mm256_testnzc_si256(__m256i __a, __m256i __b)
2904 {
2905  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2906 }
2907 
2908 /* Vector extract sign mask */
2909 /// Extracts the sign bits of double-precision floating point elements
2910 /// in a 256-bit vector of [4 x double] and writes them to the lower order
2911 /// bits of the return value.
2912 ///
2913 /// \headerfile <x86intrin.h>
2914 ///
2915 /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2916 ///
2917 /// \param __a
2918 /// A 256-bit vector of [4 x double] containing the double-precision
2919 /// floating point values with sign bits to be extracted.
2920 /// \returns The sign bits from the operand, written to bits [3:0].
2921 static __inline int __DEFAULT_FN_ATTRS
2923 {
2924  return __builtin_ia32_movmskpd256((__v4df)__a);
2925 }
2926 
2927 /// Extracts the sign bits of single-precision floating point elements
2928 /// in a 256-bit vector of [8 x float] and writes them to the lower order
2929 /// bits of the return value.
2930 ///
2931 /// \headerfile <x86intrin.h>
2932 ///
2933 /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2934 ///
2935 /// \param __a
2936 /// A 256-bit vector of [8 x float] containing the single-precision floating
2937 /// point values with sign bits to be extracted.
2938 /// \returns The sign bits from the operand, written to bits [7:0].
2939 static __inline int __DEFAULT_FN_ATTRS
2941 {
2942  return __builtin_ia32_movmskps256((__v8sf)__a);
2943 }
2944 
2945 /* Vector __zero */
2946 /// Zeroes the contents of all XMM or YMM registers.
2947 ///
2948 /// \headerfile <x86intrin.h>
2949 ///
2950 /// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
2951 static __inline void __DEFAULT_FN_ATTRS
2953 {
2954  __builtin_ia32_vzeroall();
2955 }
2956 
2957 /// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
2958 ///
2959 /// \headerfile <x86intrin.h>
2960 ///
2961 /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
2962 static __inline void __DEFAULT_FN_ATTRS
2964 {
2965  __builtin_ia32_vzeroupper();
2966 }
2967 
2968 /* Vector load with broadcast */
2969 /// Loads a scalar single-precision floating point value from the
2970 /// specified address pointed to by \a __a and broadcasts it to the elements
2971 /// of a [4 x float] vector.
2972 ///
2973 /// \headerfile <x86intrin.h>
2974 ///
2975 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
2976 ///
2977 /// \param __a
2978 /// The single-precision floating point value to be broadcast.
2979 /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
2980 /// equal to the broadcast value.
2981 static __inline __m128 __DEFAULT_FN_ATTRS
2982 _mm_broadcast_ss(float const *__a)
2983 {
2984  float __f = *__a;
2985  return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
2986 }
2987 
2988 /// Loads a scalar double-precision floating point value from the
2989 /// specified address pointed to by \a __a and broadcasts it to the elements
2990 /// of a [4 x double] vector.
2991 ///
2992 /// \headerfile <x86intrin.h>
2993 ///
2994 /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
2995 ///
2996 /// \param __a
2997 /// The double-precision floating point value to be broadcast.
2998 /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
2999 /// equal to the broadcast value.
3000 static __inline __m256d __DEFAULT_FN_ATTRS
3001 _mm256_broadcast_sd(double const *__a)
3002 {
3003  double __d = *__a;
3004  return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3005 }
3006 
3007 /// Loads a scalar single-precision floating point value from the
3008 /// specified address pointed to by \a __a and broadcasts it to the elements
3009 /// of a [8 x float] vector.
3010 ///
3011 /// \headerfile <x86intrin.h>
3012 ///
3013 /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3014 ///
3015 /// \param __a
3016 /// The single-precision floating point value to be broadcast.
3017 /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3018 /// equal to the broadcast value.
3019 static __inline __m256 __DEFAULT_FN_ATTRS
3020 _mm256_broadcast_ss(float const *__a)
3021 {
3022  float __f = *__a;
3023  return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3024 }
3025 
3026 /// Loads the data from a 128-bit vector of [2 x double] from the
3027 /// specified address pointed to by \a __a and broadcasts it to 128-bit
3028 /// elements in a 256-bit vector of [4 x double].
3029 ///
3030 /// \headerfile <x86intrin.h>
3031 ///
3032 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3033 ///
3034 /// \param __a
3035 /// The 128-bit vector of [2 x double] to be broadcast.
3036 /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3037 /// equal to the broadcast value.
3038 static __inline __m256d __DEFAULT_FN_ATTRS
3039 _mm256_broadcast_pd(__m128d const *__a)
3040 {
3041  __m128d __b = _mm_loadu_pd((const double *)__a);
3042  return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3043  0, 1, 0, 1);
3044 }
3045 
3046 /// Loads the data from a 128-bit vector of [4 x float] from the
3047 /// specified address pointed to by \a __a and broadcasts it to 128-bit
3048 /// elements in a 256-bit vector of [8 x float].
3049 ///
3050 /// \headerfile <x86intrin.h>
3051 ///
3052 /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3053 ///
3054 /// \param __a
3055 /// The 128-bit vector of [4 x float] to be broadcast.
3056 /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3057 /// equal to the broadcast value.
3058 static __inline __m256 __DEFAULT_FN_ATTRS
3059 _mm256_broadcast_ps(__m128 const *__a)
3060 {
3061  __m128 __b = _mm_loadu_ps((const float *)__a);
3062  return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3063  0, 1, 2, 3, 0, 1, 2, 3);
3064 }
3065 
3066 /* SIMD load ops */
3067 /// Loads 4 double-precision floating point values from a 32-byte aligned
3068 /// memory location pointed to by \a __p into a vector of [4 x double].
3069 ///
3070 /// \headerfile <x86intrin.h>
3071 ///
3072 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3073 ///
3074 /// \param __p
3075 /// A 32-byte aligned pointer to a memory location containing
3076 /// double-precision floating point values.
3077 /// \returns A 256-bit vector of [4 x double] containing the moved values.
3078 static __inline __m256d __DEFAULT_FN_ATTRS
3079 _mm256_load_pd(double const *__p)
3080 {
3081  return *(__m256d *)__p;
3082 }
3083 
3084 /// Loads 8 single-precision floating point values from a 32-byte aligned
3085 /// memory location pointed to by \a __p into a vector of [8 x float].
3086 ///
3087 /// \headerfile <x86intrin.h>
3088 ///
3089 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3090 ///
3091 /// \param __p
3092 /// A 32-byte aligned pointer to a memory location containing float values.
3093 /// \returns A 256-bit vector of [8 x float] containing the moved values.
3094 static __inline __m256 __DEFAULT_FN_ATTRS
3095 _mm256_load_ps(float const *__p)
3096 {
3097  return *(__m256 *)__p;
3098 }
3099 
3100 /// Loads 4 double-precision floating point values from an unaligned
3101 /// memory location pointed to by \a __p into a vector of [4 x double].
3102 ///
3103 /// \headerfile <x86intrin.h>
3104 ///
3105 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3106 ///
3107 /// \param __p
3108 /// A pointer to a memory location containing double-precision floating
3109 /// point values.
3110 /// \returns A 256-bit vector of [4 x double] containing the moved values.
3111 static __inline __m256d __DEFAULT_FN_ATTRS
3112 _mm256_loadu_pd(double const *__p)
3113 {
3114  struct __loadu_pd {
3115  __m256d __v;
3116  } __attribute__((__packed__, __may_alias__));
3117  return ((struct __loadu_pd*)__p)->__v;
3118 }
3119 
3120 /// Loads 8 single-precision floating point values from an unaligned
3121 /// memory location pointed to by \a __p into a vector of [8 x float].
3122 ///
3123 /// \headerfile <x86intrin.h>
3124 ///
3125 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3126 ///
3127 /// \param __p
3128 /// A pointer to a memory location containing single-precision floating
3129 /// point values.
3130 /// \returns A 256-bit vector of [8 x float] containing the moved values.
3131 static __inline __m256 __DEFAULT_FN_ATTRS
3132 _mm256_loadu_ps(float const *__p)
3133 {
3134  struct __loadu_ps {
3135  __m256 __v;
3136  } __attribute__((__packed__, __may_alias__));
3137  return ((struct __loadu_ps*)__p)->__v;
3138 }
3139 
3140 /// Loads 256 bits of integer data from a 32-byte aligned memory
3141 /// location pointed to by \a __p into elements of a 256-bit integer vector.
3142 ///
3143 /// \headerfile <x86intrin.h>
3144 ///
3145 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3146 ///
3147 /// \param __p
3148 /// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3149 /// values.
3150 /// \returns A 256-bit integer vector containing the moved values.
3151 static __inline __m256i __DEFAULT_FN_ATTRS
3152 _mm256_load_si256(__m256i const *__p)
3153 {
3154  return *__p;
3155 }
3156 
3157 /// Loads 256 bits of integer data from an unaligned memory location
3158 /// pointed to by \a __p into a 256-bit integer vector.
3159 ///
3160 /// \headerfile <x86intrin.h>
3161 ///
3162 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3163 ///
3164 /// \param __p
3165 /// A pointer to a 256-bit integer vector containing integer values.
3166 /// \returns A 256-bit integer vector containing the moved values.
3167 static __inline __m256i __DEFAULT_FN_ATTRS
3168 _mm256_loadu_si256(__m256i const *__p)
3169 {
3170  struct __loadu_si256 {
3171  __m256i __v;
3172  } __attribute__((__packed__, __may_alias__));
3173  return ((struct __loadu_si256*)__p)->__v;
3174 }
3175 
3176 /// Loads 256 bits of integer data from an unaligned memory location
3177 /// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3178 /// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3179 /// line boundary.
3180 ///
3181 /// \headerfile <x86intrin.h>
3182 ///
3183 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3184 ///
3185 /// \param __p
3186 /// A pointer to a 256-bit integer vector containing integer values.
3187 /// \returns A 256-bit integer vector containing the moved values.
3188 static __inline __m256i __DEFAULT_FN_ATTRS
3189 _mm256_lddqu_si256(__m256i const *__p)
3190 {
3191  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3192 }
3193 
3194 /* SIMD store ops */
3195 /// Stores double-precision floating point values from a 256-bit vector
3196 /// of [4 x double] to a 32-byte aligned memory location pointed to by
3197 /// \a __p.
3198 ///
3199 /// \headerfile <x86intrin.h>
3200 ///
3201 /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3202 ///
3203 /// \param __p
3204 /// A 32-byte aligned pointer to a memory location that will receive the
3205 /// double-precision floaing point values.
3206 /// \param __a
3207 /// A 256-bit vector of [4 x double] containing the values to be moved.
3208 static __inline void __DEFAULT_FN_ATTRS
3209 _mm256_store_pd(double *__p, __m256d __a)
3210 {
3211  *(__m256d *)__p = __a;
3212 }
3213 
3214 /// Stores single-precision floating point values from a 256-bit vector
3215 /// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3216 ///
3217 /// \headerfile <x86intrin.h>
3218 ///
3219 /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3220 ///
3221 /// \param __p
3222 /// A 32-byte aligned pointer to a memory location that will receive the
3223 /// float values.
3224 /// \param __a
3225 /// A 256-bit vector of [8 x float] containing the values to be moved.
3226 static __inline void __DEFAULT_FN_ATTRS
3227 _mm256_store_ps(float *__p, __m256 __a)
3228 {
3229  *(__m256 *)__p = __a;
3230 }
3231 
3232 /// Stores double-precision floating point values from a 256-bit vector
3233 /// of [4 x double] to an unaligned memory location pointed to by \a __p.
3234 ///
3235 /// \headerfile <x86intrin.h>
3236 ///
3237 /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3238 ///
3239 /// \param __p
3240 /// A pointer to a memory location that will receive the double-precision
3241 /// floating point values.
3242 /// \param __a
3243 /// A 256-bit vector of [4 x double] containing the values to be moved.
3244 static __inline void __DEFAULT_FN_ATTRS
3245 _mm256_storeu_pd(double *__p, __m256d __a)
3246 {
3247  struct __storeu_pd {
3248  __m256d __v;
3249  } __attribute__((__packed__, __may_alias__));
3250  ((struct __storeu_pd*)__p)->__v = __a;
3251 }
3252 
3253 /// Stores single-precision floating point values from a 256-bit vector
3254 /// of [8 x float] to an unaligned memory location pointed to by \a __p.
3255 ///
3256 /// \headerfile <x86intrin.h>
3257 ///
3258 /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3259 ///
3260 /// \param __p
3261 /// A pointer to a memory location that will receive the float values.
3262 /// \param __a
3263 /// A 256-bit vector of [8 x float] containing the values to be moved.
3264 static __inline void __DEFAULT_FN_ATTRS
3265 _mm256_storeu_ps(float *__p, __m256 __a)
3266 {
3267  struct __storeu_ps {
3268  __m256 __v;
3269  } __attribute__((__packed__, __may_alias__));
3270  ((struct __storeu_ps*)__p)->__v = __a;
3271 }
3272 
3273 /// Stores integer values from a 256-bit integer vector to a 32-byte
3274 /// aligned memory location pointed to by \a __p.
3275 ///
3276 /// \headerfile <x86intrin.h>
3277 ///
3278 /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3279 ///
3280 /// \param __p
3281 /// A 32-byte aligned pointer to a memory location that will receive the
3282 /// integer values.
3283 /// \param __a
3284 /// A 256-bit integer vector containing the values to be moved.
3285 static __inline void __DEFAULT_FN_ATTRS
3286 _mm256_store_si256(__m256i *__p, __m256i __a)
3287 {
3288  *__p = __a;
3289 }
3290 
3291 /// Stores integer values from a 256-bit integer vector to an unaligned
3292 /// memory location pointed to by \a __p.
3293 ///
3294 /// \headerfile <x86intrin.h>
3295 ///
3296 /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3297 ///
3298 /// \param __p
3299 /// A pointer to a memory location that will receive the integer values.
3300 /// \param __a
3301 /// A 256-bit integer vector containing the values to be moved.
3302 static __inline void __DEFAULT_FN_ATTRS
3303 _mm256_storeu_si256(__m256i *__p, __m256i __a)
3304 {
3305  struct __storeu_si256 {
3306  __m256i __v;
3307  } __attribute__((__packed__, __may_alias__));
3308  ((struct __storeu_si256*)__p)->__v = __a;
3309 }
3310 
3311 /* Conditional load ops */
3312 /// Conditionally loads double-precision floating point elements from a
3313 /// memory location pointed to by \a __p into a 128-bit vector of
3314 /// [2 x double], depending on the mask bits associated with each data
3315 /// element.
3316 ///
3317 /// \headerfile <x86intrin.h>
3318 ///
3319 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3320 ///
3321 /// \param __p
3322 /// A pointer to a memory location that contains the double-precision
3323 /// floating point values.
3324 /// \param __m
3325 /// A 128-bit integer vector containing the mask. The most significant bit of
3326 /// each data element represents the mask bits. If a mask bit is zero, the
3327 /// corresponding value in the memory location is not loaded and the
3328 /// corresponding field in the return value is set to zero.
3329 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
3330 static __inline __m128d __DEFAULT_FN_ATTRS
3331 _mm_maskload_pd(double const *__p, __m128i __m)
3332 {
3333  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3334 }
3335 
3336 /// Conditionally loads double-precision floating point elements from a
3337 /// memory location pointed to by \a __p into a 256-bit vector of
3338 /// [4 x double], depending on the mask bits associated with each data
3339 /// element.
3340 ///
3341 /// \headerfile <x86intrin.h>
3342 ///
3343 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3344 ///
3345 /// \param __p
3346 /// A pointer to a memory location that contains the double-precision
3347 /// floating point values.
3348 /// \param __m
3349 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3350 /// significant bit of each quadword element represents the mask bits. If a
3351 /// mask bit is zero, the corresponding value in the memory location is not
3352 /// loaded and the corresponding field in the return value is set to zero.
3353 /// \returns A 256-bit vector of [4 x double] containing the loaded values.
3354 static __inline __m256d __DEFAULT_FN_ATTRS
3355 _mm256_maskload_pd(double const *__p, __m256i __m)
3356 {
3357  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3358  (__v4di)__m);
3359 }
3360 
3361 /// Conditionally loads single-precision floating point elements from a
3362 /// memory location pointed to by \a __p into a 128-bit vector of
3363 /// [4 x float], depending on the mask bits associated with each data
3364 /// element.
3365 ///
3366 /// \headerfile <x86intrin.h>
3367 ///
3368 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3369 ///
3370 /// \param __p
3371 /// A pointer to a memory location that contains the single-precision
3372 /// floating point values.
3373 /// \param __m
3374 /// A 128-bit integer vector containing the mask. The most significant bit of
3375 /// each data element represents the mask bits. If a mask bit is zero, the
3376 /// corresponding value in the memory location is not loaded and the
3377 /// corresponding field in the return value is set to zero.
3378 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
3379 static __inline __m128 __DEFAULT_FN_ATTRS
3380 _mm_maskload_ps(float const *__p, __m128i __m)
3381 {
3382  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3383 }
3384 
3385 /// Conditionally loads single-precision floating point elements from a
3386 /// memory location pointed to by \a __p into a 256-bit vector of
3387 /// [8 x float], depending on the mask bits associated with each data
3388 /// element.
3389 ///
3390 /// \headerfile <x86intrin.h>
3391 ///
3392 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3393 ///
3394 /// \param __p
3395 /// A pointer to a memory location that contains the single-precision
3396 /// floating point values.
3397 /// \param __m
3398 /// A 256-bit integer vector of [8 x dword] containing the mask. The most
3399 /// significant bit of each dword element represents the mask bits. If a mask
3400 /// bit is zero, the corresponding value in the memory location is not loaded
3401 /// and the corresponding field in the return value is set to zero.
3402 /// \returns A 256-bit vector of [8 x float] containing the loaded values.
3403 static __inline __m256 __DEFAULT_FN_ATTRS
3404 _mm256_maskload_ps(float const *__p, __m256i __m)
3405 {
3406  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3407 }
3408 
3409 /* Conditional store ops */
3410 /// Moves single-precision floating point values from a 256-bit vector
3411 /// of [8 x float] to a memory location pointed to by \a __p, according to
3412 /// the specified mask.
3413 ///
3414 /// \headerfile <x86intrin.h>
3415 ///
3416 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3417 ///
3418 /// \param __p
3419 /// A pointer to a memory location that will receive the float values.
3420 /// \param __m
3421 /// A 256-bit integer vector of [8 x dword] containing the mask. The most
3422 /// significant bit of each dword element in the mask vector represents the
3423 /// mask bits. If a mask bit is zero, the corresponding value from vector
3424 /// \a __a is not stored and the corresponding field in the memory location
3425 /// pointed to by \a __p is not changed.
3426 /// \param __a
3427 /// A 256-bit vector of [8 x float] containing the values to be stored.
3428 static __inline void __DEFAULT_FN_ATTRS
3429 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3430 {
3431  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3432 }
3433 
3434 /// Moves double-precision values from a 128-bit vector of [2 x double]
3435 /// to a memory location pointed to by \a __p, according to the specified
3436 /// mask.
3437 ///
3438 /// \headerfile <x86intrin.h>
3439 ///
3440 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3441 ///
3442 /// \param __p
3443 /// A pointer to a memory location that will receive the float values.
3444 /// \param __m
3445 /// A 128-bit integer vector containing the mask. The most significant bit of
3446 /// each field in the mask vector represents the mask bits. If a mask bit is
3447 /// zero, the corresponding value from vector \a __a is not stored and the
3448 /// corresponding field in the memory location pointed to by \a __p is not
3449 /// changed.
3450 /// \param __a
3451 /// A 128-bit vector of [2 x double] containing the values to be stored.
3452 static __inline void __DEFAULT_FN_ATTRS
3453 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3454 {
3455  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3456 }
3457 
3458 /// Moves double-precision values from a 256-bit vector of [4 x double]
3459 /// to a memory location pointed to by \a __p, according to the specified
3460 /// mask.
3461 ///
3462 /// \headerfile <x86intrin.h>
3463 ///
3464 /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3465 ///
3466 /// \param __p
3467 /// A pointer to a memory location that will receive the float values.
3468 /// \param __m
3469 /// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3470 /// significant bit of each quadword element in the mask vector represents
3471 /// the mask bits. If a mask bit is zero, the corresponding value from vector
3472 /// __a is not stored and the corresponding field in the memory location
3473 /// pointed to by \a __p is not changed.
3474 /// \param __a
3475 /// A 256-bit vector of [4 x double] containing the values to be stored.
3476 static __inline void __DEFAULT_FN_ATTRS
3477 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3478 {
3479  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3480 }
3481 
3482 /// Moves single-precision floating point values from a 128-bit vector
3483 /// of [4 x float] to a memory location pointed to by \a __p, according to
3484 /// the specified mask.
3485 ///
3486 /// \headerfile <x86intrin.h>
3487 ///
3488 /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3489 ///
3490 /// \param __p
3491 /// A pointer to a memory location that will receive the float values.
3492 /// \param __m
3493 /// A 128-bit integer vector containing the mask. The most significant bit of
3494 /// each field in the mask vector represents the mask bits. If a mask bit is
3495 /// zero, the corresponding value from vector __a is not stored and the
3496 /// corresponding field in the memory location pointed to by \a __p is not
3497 /// changed.
3498 /// \param __a
3499 /// A 128-bit vector of [4 x float] containing the values to be stored.
3500 static __inline void __DEFAULT_FN_ATTRS
3501 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3502 {
3503  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3504 }
3505 
3506 /* Cacheability support ops */
3507 /// Moves integer data from a 256-bit integer vector to a 32-byte
3508 /// aligned memory location. To minimize caching, the data is flagged as
3509 /// non-temporal (unlikely to be used again soon).
3510 ///
3511 /// \headerfile <x86intrin.h>
3512 ///
3513 /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3514 ///
3515 /// \param __a
3516 /// A pointer to a 32-byte aligned memory location that will receive the
3517 /// integer values.
3518 /// \param __b
3519 /// A 256-bit integer vector containing the values to be moved.
3520 static __inline void __DEFAULT_FN_ATTRS
3521 _mm256_stream_si256(__m256i *__a, __m256i __b)
3522 {
3523  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3524  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3525 }
3526 
3527 /// Moves double-precision values from a 256-bit vector of [4 x double]
3528 /// to a 32-byte aligned memory location. To minimize caching, the data is
3529 /// flagged as non-temporal (unlikely to be used again soon).
3530 ///
3531 /// \headerfile <x86intrin.h>
3532 ///
3533 /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3534 ///
3535 /// \param __a
3536 /// A pointer to a 32-byte aligned memory location that will receive the
3537 /// double-precision floating-point values.
3538 /// \param __b
3539 /// A 256-bit vector of [4 x double] containing the values to be moved.
3540 static __inline void __DEFAULT_FN_ATTRS
3541 _mm256_stream_pd(double *__a, __m256d __b)
3542 {
3543  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3544  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3545 }
3546 
3547 /// Moves single-precision floating point values from a 256-bit vector
3548 /// of [8 x float] to a 32-byte aligned memory location. To minimize
3549 /// caching, the data is flagged as non-temporal (unlikely to be used again
3550 /// soon).
3551 ///
3552 /// \headerfile <x86intrin.h>
3553 ///
3554 /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3555 ///
3556 /// \param __p
3557 /// A pointer to a 32-byte aligned memory location that will receive the
3558 /// single-precision floating point values.
3559 /// \param __a
3560 /// A 256-bit vector of [8 x float] containing the values to be moved.
3561 static __inline void __DEFAULT_FN_ATTRS
3562 _mm256_stream_ps(float *__p, __m256 __a)
3563 {
3564  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3565  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3566 }
3567 
3568 /* Create vectors */
3569 /// Create a 256-bit vector of [4 x double] with undefined values.
3570 ///
3571 /// \headerfile <x86intrin.h>
3572 ///
3573 /// This intrinsic has no corresponding instruction.
3574 ///
3575 /// \returns A 256-bit vector of [4 x double] containing undefined values.
3576 static __inline__ __m256d __DEFAULT_FN_ATTRS
3578 {
3579  return (__m256d)__builtin_ia32_undef256();
3580 }
3581 
3582 /// Create a 256-bit vector of [8 x float] with undefined values.
3583 ///
3584 /// \headerfile <x86intrin.h>
3585 ///
3586 /// This intrinsic has no corresponding instruction.
3587 ///
3588 /// \returns A 256-bit vector of [8 x float] containing undefined values.
3589 static __inline__ __m256 __DEFAULT_FN_ATTRS
3591 {
3592  return (__m256)__builtin_ia32_undef256();
3593 }
3594 
3595 /// Create a 256-bit integer vector with undefined values.
3596 ///
3597 /// \headerfile <x86intrin.h>
3598 ///
3599 /// This intrinsic has no corresponding instruction.
3600 ///
3601 /// \returns A 256-bit integer vector containing undefined values.
3602 static __inline__ __m256i __DEFAULT_FN_ATTRS
3604 {
3605  return (__m256i)__builtin_ia32_undef256();
3606 }
3607 
3608 /// Constructs a 256-bit floating-point vector of [4 x double]
3609 /// initialized with the specified double-precision floating-point values.
3610 ///
3611 /// \headerfile <x86intrin.h>
3612 ///
3613 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3614 /// instruction.
3615 ///
3616 /// \param __a
3617 /// A double-precision floating-point value used to initialize bits [255:192]
3618 /// of the result.
3619 /// \param __b
3620 /// A double-precision floating-point value used to initialize bits [191:128]
3621 /// of the result.
3622 /// \param __c
3623 /// A double-precision floating-point value used to initialize bits [127:64]
3624 /// of the result.
3625 /// \param __d
3626 /// A double-precision floating-point value used to initialize bits [63:0]
3627 /// of the result.
3628 /// \returns An initialized 256-bit floating-point vector of [4 x double].
3629 static __inline __m256d __DEFAULT_FN_ATTRS
3630 _mm256_set_pd(double __a, double __b, double __c, double __d)
3631 {
3632  return __extension__ (__m256d){ __d, __c, __b, __a };
3633 }
3634 
3635 /// Constructs a 256-bit floating-point vector of [8 x float] initialized
3636 /// with the specified single-precision floating-point values.
3637 ///
3638 /// \headerfile <x86intrin.h>
3639 ///
3640 /// This intrinsic is a utility function and does not correspond to a specific
3641 /// instruction.
3642 ///
3643 /// \param __a
3644 /// A single-precision floating-point value used to initialize bits [255:224]
3645 /// of the result.
3646 /// \param __b
3647 /// A single-precision floating-point value used to initialize bits [223:192]
3648 /// of the result.
3649 /// \param __c
3650 /// A single-precision floating-point value used to initialize bits [191:160]
3651 /// of the result.
3652 /// \param __d
3653 /// A single-precision floating-point value used to initialize bits [159:128]
3654 /// of the result.
3655 /// \param __e
3656 /// A single-precision floating-point value used to initialize bits [127:96]
3657 /// of the result.
3658 /// \param __f
3659 /// A single-precision floating-point value used to initialize bits [95:64]
3660 /// of the result.
3661 /// \param __g
3662 /// A single-precision floating-point value used to initialize bits [63:32]
3663 /// of the result.
3664 /// \param __h
3665 /// A single-precision floating-point value used to initialize bits [31:0]
3666 /// of the result.
3667 /// \returns An initialized 256-bit floating-point vector of [8 x float].
3668 static __inline __m256 __DEFAULT_FN_ATTRS
3669 _mm256_set_ps(float __a, float __b, float __c, float __d,
3670  float __e, float __f, float __g, float __h)
3671 {
3672  return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3673 }
3674 
3675 /// Constructs a 256-bit integer vector initialized with the specified
3676 /// 32-bit integral values.
3677 ///
3678 /// \headerfile <x86intrin.h>
3679 ///
3680 /// This intrinsic is a utility function and does not correspond to a specific
3681 /// instruction.
3682 ///
3683 /// \param __i0
3684 /// A 32-bit integral value used to initialize bits [255:224] of the result.
3685 /// \param __i1
3686 /// A 32-bit integral value used to initialize bits [223:192] of the result.
3687 /// \param __i2
3688 /// A 32-bit integral value used to initialize bits [191:160] of the result.
3689 /// \param __i3
3690 /// A 32-bit integral value used to initialize bits [159:128] of the result.
3691 /// \param __i4
3692 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3693 /// \param __i5
3694 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3695 /// \param __i6
3696 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3697 /// \param __i7
3698 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3699 /// \returns An initialized 256-bit integer vector.
3700 static __inline __m256i __DEFAULT_FN_ATTRS
3701 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3702  int __i4, int __i5, int __i6, int __i7)
3703 {
3704  return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3705 }
3706 
3707 /// Constructs a 256-bit integer vector initialized with the specified
3708 /// 16-bit integral values.
3709 ///
3710 /// \headerfile <x86intrin.h>
3711 ///
3712 /// This intrinsic is a utility function and does not correspond to a specific
3713 /// instruction.
3714 ///
3715 /// \param __w15
3716 /// A 16-bit integral value used to initialize bits [255:240] of the result.
3717 /// \param __w14
3718 /// A 16-bit integral value used to initialize bits [239:224] of the result.
3719 /// \param __w13
3720 /// A 16-bit integral value used to initialize bits [223:208] of the result.
3721 /// \param __w12
3722 /// A 16-bit integral value used to initialize bits [207:192] of the result.
3723 /// \param __w11
3724 /// A 16-bit integral value used to initialize bits [191:176] of the result.
3725 /// \param __w10
3726 /// A 16-bit integral value used to initialize bits [175:160] of the result.
3727 /// \param __w09
3728 /// A 16-bit integral value used to initialize bits [159:144] of the result.
3729 /// \param __w08
3730 /// A 16-bit integral value used to initialize bits [143:128] of the result.
3731 /// \param __w07
3732 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3733 /// \param __w06
3734 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3735 /// \param __w05
3736 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3737 /// \param __w04
3738 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3739 /// \param __w03
3740 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3741 /// \param __w02
3742 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3743 /// \param __w01
3744 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3745 /// \param __w00
3746 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3747 /// \returns An initialized 256-bit integer vector.
3748 static __inline __m256i __DEFAULT_FN_ATTRS
3749 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3750  short __w11, short __w10, short __w09, short __w08,
3751  short __w07, short __w06, short __w05, short __w04,
3752  short __w03, short __w02, short __w01, short __w00)
3753 {
3754  return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3755  __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3756 }
3757 
3758 /// Constructs a 256-bit integer vector initialized with the specified
3759 /// 8-bit integral values.
3760 ///
3761 /// \headerfile <x86intrin.h>
3762 ///
3763 /// This intrinsic is a utility function and does not correspond to a specific
3764 /// instruction.
3765 ///
3766 /// \param __b31
3767 /// An 8-bit integral value used to initialize bits [255:248] of the result.
3768 /// \param __b30
3769 /// An 8-bit integral value used to initialize bits [247:240] of the result.
3770 /// \param __b29
3771 /// An 8-bit integral value used to initialize bits [239:232] of the result.
3772 /// \param __b28
3773 /// An 8-bit integral value used to initialize bits [231:224] of the result.
3774 /// \param __b27
3775 /// An 8-bit integral value used to initialize bits [223:216] of the result.
3776 /// \param __b26
3777 /// An 8-bit integral value used to initialize bits [215:208] of the result.
3778 /// \param __b25
3779 /// An 8-bit integral value used to initialize bits [207:200] of the result.
3780 /// \param __b24
3781 /// An 8-bit integral value used to initialize bits [199:192] of the result.
3782 /// \param __b23
3783 /// An 8-bit integral value used to initialize bits [191:184] of the result.
3784 /// \param __b22
3785 /// An 8-bit integral value used to initialize bits [183:176] of the result.
3786 /// \param __b21
3787 /// An 8-bit integral value used to initialize bits [175:168] of the result.
3788 /// \param __b20
3789 /// An 8-bit integral value used to initialize bits [167:160] of the result.
3790 /// \param __b19
3791 /// An 8-bit integral value used to initialize bits [159:152] of the result.
3792 /// \param __b18
3793 /// An 8-bit integral value used to initialize bits [151:144] of the result.
3794 /// \param __b17
3795 /// An 8-bit integral value used to initialize bits [143:136] of the result.
3796 /// \param __b16
3797 /// An 8-bit integral value used to initialize bits [135:128] of the result.
3798 /// \param __b15
3799 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3800 /// \param __b14
3801 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3802 /// \param __b13
3803 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3804 /// \param __b12
3805 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3806 /// \param __b11
3807 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3808 /// \param __b10
3809 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3810 /// \param __b09
3811 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3812 /// \param __b08
3813 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3814 /// \param __b07
3815 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3816 /// \param __b06
3817 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3818 /// \param __b05
3819 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3820 /// \param __b04
3821 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3822 /// \param __b03
3823 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3824 /// \param __b02
3825 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3826 /// \param __b01
3827 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3828 /// \param __b00
3829 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3830 /// \returns An initialized 256-bit integer vector.
3831 static __inline __m256i __DEFAULT_FN_ATTRS
3832 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3833  char __b27, char __b26, char __b25, char __b24,
3834  char __b23, char __b22, char __b21, char __b20,
3835  char __b19, char __b18, char __b17, char __b16,
3836  char __b15, char __b14, char __b13, char __b12,
3837  char __b11, char __b10, char __b09, char __b08,
3838  char __b07, char __b06, char __b05, char __b04,
3839  char __b03, char __b02, char __b01, char __b00)
3840 {
3841  return __extension__ (__m256i)(__v32qi){
3842  __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3843  __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3844  __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3845  __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3846  };
3847 }
3848 
3849 /// Constructs a 256-bit integer vector initialized with the specified
3850 /// 64-bit integral values.
3851 ///
3852 /// \headerfile <x86intrin.h>
3853 ///
3854 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3855 /// instruction.
3856 ///
3857 /// \param __a
3858 /// A 64-bit integral value used to initialize bits [255:192] of the result.
3859 /// \param __b
3860 /// A 64-bit integral value used to initialize bits [191:128] of the result.
3861 /// \param __c
3862 /// A 64-bit integral value used to initialize bits [127:64] of the result.
3863 /// \param __d
3864 /// A 64-bit integral value used to initialize bits [63:0] of the result.
3865 /// \returns An initialized 256-bit integer vector.
3866 static __inline __m256i __DEFAULT_FN_ATTRS
3867 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3868 {
3869  return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3870 }
3871 
3872 /* Create vectors with elements in reverse order */
3873 /// Constructs a 256-bit floating-point vector of [4 x double],
3874 /// initialized in reverse order with the specified double-precision
3875 /// floating-point values.
3876 ///
3877 /// \headerfile <x86intrin.h>
3878 ///
3879 /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3880 /// instruction.
3881 ///
3882 /// \param __a
3883 /// A double-precision floating-point value used to initialize bits [63:0]
3884 /// of the result.
3885 /// \param __b
3886 /// A double-precision floating-point value used to initialize bits [127:64]
3887 /// of the result.
3888 /// \param __c
3889 /// A double-precision floating-point value used to initialize bits [191:128]
3890 /// of the result.
3891 /// \param __d
3892 /// A double-precision floating-point value used to initialize bits [255:192]
3893 /// of the result.
3894 /// \returns An initialized 256-bit floating-point vector of [4 x double].
3895 static __inline __m256d __DEFAULT_FN_ATTRS
3896 _mm256_setr_pd(double __a, double __b, double __c, double __d)
3897 {
3898  return _mm256_set_pd(__d, __c, __b, __a);
3899 }
3900 
3901 /// Constructs a 256-bit floating-point vector of [8 x float],
3902 /// initialized in reverse order with the specified single-precision
3903 /// float-point values.
3904 ///
3905 /// \headerfile <x86intrin.h>
3906 ///
3907 /// This intrinsic is a utility function and does not correspond to a specific
3908 /// instruction.
3909 ///
3910 /// \param __a
3911 /// A single-precision floating-point value used to initialize bits [31:0]
3912 /// of the result.
3913 /// \param __b
3914 /// A single-precision floating-point value used to initialize bits [63:32]
3915 /// of the result.
3916 /// \param __c
3917 /// A single-precision floating-point value used to initialize bits [95:64]
3918 /// of the result.
3919 /// \param __d
3920 /// A single-precision floating-point value used to initialize bits [127:96]
3921 /// of the result.
3922 /// \param __e
3923 /// A single-precision floating-point value used to initialize bits [159:128]
3924 /// of the result.
3925 /// \param __f
3926 /// A single-precision floating-point value used to initialize bits [191:160]
3927 /// of the result.
3928 /// \param __g
3929 /// A single-precision floating-point value used to initialize bits [223:192]
3930 /// of the result.
3931 /// \param __h
3932 /// A single-precision floating-point value used to initialize bits [255:224]
3933 /// of the result.
3934 /// \returns An initialized 256-bit floating-point vector of [8 x float].
3935 static __inline __m256 __DEFAULT_FN_ATTRS
3936 _mm256_setr_ps(float __a, float __b, float __c, float __d,
3937  float __e, float __f, float __g, float __h)
3938 {
3939  return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3940 }
3941 
3942 /// Constructs a 256-bit integer vector, initialized in reverse order
3943 /// with the specified 32-bit integral values.
3944 ///
3945 /// \headerfile <x86intrin.h>
3946 ///
3947 /// This intrinsic is a utility function and does not correspond to a specific
3948 /// instruction.
3949 ///
3950 /// \param __i0
3951 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3952 /// \param __i1
3953 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3954 /// \param __i2
3955 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3956 /// \param __i3
3957 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3958 /// \param __i4
3959 /// A 32-bit integral value used to initialize bits [159:128] of the result.
3960 /// \param __i5
3961 /// A 32-bit integral value used to initialize bits [191:160] of the result.
3962 /// \param __i6
3963 /// A 32-bit integral value used to initialize bits [223:192] of the result.
3964 /// \param __i7
3965 /// A 32-bit integral value used to initialize bits [255:224] of the result.
3966 /// \returns An initialized 256-bit integer vector.
3967 static __inline __m256i __DEFAULT_FN_ATTRS
3968 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
3969  int __i4, int __i5, int __i6, int __i7)
3970 {
3971  return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
3972 }
3973 
3974 /// Constructs a 256-bit integer vector, initialized in reverse order
3975 /// with the specified 16-bit integral values.
3976 ///
3977 /// \headerfile <x86intrin.h>
3978 ///
3979 /// This intrinsic is a utility function and does not correspond to a specific
3980 /// instruction.
3981 ///
3982 /// \param __w15
3983 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3984 /// \param __w14
3985 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3986 /// \param __w13
3987 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3988 /// \param __w12
3989 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3990 /// \param __w11
3991 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3992 /// \param __w10
3993 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3994 /// \param __w09
3995 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3996 /// \param __w08
3997 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3998 /// \param __w07
3999 /// A 16-bit integral value used to initialize bits [143:128] of the result.
4000 /// \param __w06
4001 /// A 16-bit integral value used to initialize bits [159:144] of the result.
4002 /// \param __w05
4003 /// A 16-bit integral value used to initialize bits [175:160] of the result.
4004 /// \param __w04
4005 /// A 16-bit integral value used to initialize bits [191:176] of the result.
4006 /// \param __w03
4007 /// A 16-bit integral value used to initialize bits [207:192] of the result.
4008 /// \param __w02
4009 /// A 16-bit integral value used to initialize bits [223:208] of the result.
4010 /// \param __w01
4011 /// A 16-bit integral value used to initialize bits [239:224] of the result.
4012 /// \param __w00
4013 /// A 16-bit integral value used to initialize bits [255:240] of the result.
4014 /// \returns An initialized 256-bit integer vector.
4015 static __inline __m256i __DEFAULT_FN_ATTRS
4016 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4017  short __w11, short __w10, short __w09, short __w08,
4018  short __w07, short __w06, short __w05, short __w04,
4019  short __w03, short __w02, short __w01, short __w00)
4020 {
4021  return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4022  __w04, __w05, __w06, __w07,
4023  __w08, __w09, __w10, __w11,
4024  __w12, __w13, __w14, __w15);
4025 }
4026 
4027 /// Constructs a 256-bit integer vector, initialized in reverse order
4028 /// with the specified 8-bit integral values.
4029 ///
4030 /// \headerfile <x86intrin.h>
4031 ///
4032 /// This intrinsic is a utility function and does not correspond to a specific
4033 /// instruction.
4034 ///
4035 /// \param __b31
4036 /// An 8-bit integral value used to initialize bits [7:0] of the result.
4037 /// \param __b30
4038 /// An 8-bit integral value used to initialize bits [15:8] of the result.
4039 /// \param __b29
4040 /// An 8-bit integral value used to initialize bits [23:16] of the result.
4041 /// \param __b28
4042 /// An 8-bit integral value used to initialize bits [31:24] of the result.
4043 /// \param __b27
4044 /// An 8-bit integral value used to initialize bits [39:32] of the result.
4045 /// \param __b26
4046 /// An 8-bit integral value used to initialize bits [47:40] of the result.
4047 /// \param __b25
4048 /// An 8-bit integral value used to initialize bits [55:48] of the result.
4049 /// \param __b24
4050 /// An 8-bit integral value used to initialize bits [63:56] of the result.
4051 /// \param __b23
4052 /// An 8-bit integral value used to initialize bits [71:64] of the result.
4053 /// \param __b22
4054 /// An 8-bit integral value used to initialize bits [79:72] of the result.
4055 /// \param __b21
4056 /// An 8-bit integral value used to initialize bits [87:80] of the result.
4057 /// \param __b20
4058 /// An 8-bit integral value used to initialize bits [95:88] of the result.
4059 /// \param __b19
4060 /// An 8-bit integral value used to initialize bits [103:96] of the result.
4061 /// \param __b18
4062 /// An 8-bit integral value used to initialize bits [111:104] of the result.
4063 /// \param __b17
4064 /// An 8-bit integral value used to initialize bits [119:112] of the result.
4065 /// \param __b16
4066 /// An 8-bit integral value used to initialize bits [127:120] of the result.
4067 /// \param __b15
4068 /// An 8-bit integral value used to initialize bits [135:128] of the result.
4069 /// \param __b14
4070 /// An 8-bit integral value used to initialize bits [143:136] of the result.
4071 /// \param __b13
4072 /// An 8-bit integral value used to initialize bits [151:144] of the result.
4073 /// \param __b12
4074 /// An 8-bit integral value used to initialize bits [159:152] of the result.
4075 /// \param __b11
4076 /// An 8-bit integral value used to initialize bits [167:160] of the result.
4077 /// \param __b10
4078 /// An 8-bit integral value used to initialize bits [175:168] of the result.
4079 /// \param __b09
4080 /// An 8-bit integral value used to initialize bits [183:176] of the result.
4081 /// \param __b08
4082 /// An 8-bit integral value used to initialize bits [191:184] of the result.
4083 /// \param __b07
4084 /// An 8-bit integral value used to initialize bits [199:192] of the result.
4085 /// \param __b06
4086 /// An 8-bit integral value used to initialize bits [207:200] of the result.
4087 /// \param __b05
4088 /// An 8-bit integral value used to initialize bits [215:208] of the result.
4089 /// \param __b04
4090 /// An 8-bit integral value used to initialize bits [223:216] of the result.
4091 /// \param __b03
4092 /// An 8-bit integral value used to initialize bits [231:224] of the result.
4093 /// \param __b02
4094 /// An 8-bit integral value used to initialize bits [239:232] of the result.
4095 /// \param __b01
4096 /// An 8-bit integral value used to initialize bits [247:240] of the result.
4097 /// \param __b00
4098 /// An 8-bit integral value used to initialize bits [255:248] of the result.
4099 /// \returns An initialized 256-bit integer vector.
4100 static __inline __m256i __DEFAULT_FN_ATTRS
4101 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4102  char __b27, char __b26, char __b25, char __b24,
4103  char __b23, char __b22, char __b21, char __b20,
4104  char __b19, char __b18, char __b17, char __b16,
4105  char __b15, char __b14, char __b13, char __b12,
4106  char __b11, char __b10, char __b09, char __b08,
4107  char __b07, char __b06, char __b05, char __b04,
4108  char __b03, char __b02, char __b01, char __b00)
4109 {
4110  return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4111  __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4112  __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4113  __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4114 }
4115 
4116 /// Constructs a 256-bit integer vector, initialized in reverse order
4117 /// with the specified 64-bit integral values.
4118 ///
4119 /// \headerfile <x86intrin.h>
4120 ///
4121 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4122 /// instruction.
4123 ///
4124 /// \param __a
4125 /// A 64-bit integral value used to initialize bits [63:0] of the result.
4126 /// \param __b
4127 /// A 64-bit integral value used to initialize bits [127:64] of the result.
4128 /// \param __c
4129 /// A 64-bit integral value used to initialize bits [191:128] of the result.
4130 /// \param __d
4131 /// A 64-bit integral value used to initialize bits [255:192] of the result.
4132 /// \returns An initialized 256-bit integer vector.
4133 static __inline __m256i __DEFAULT_FN_ATTRS
4134 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4135 {
4136  return _mm256_set_epi64x(__d, __c, __b, __a);
4137 }
4138 
4139 /* Create vectors with repeated elements */
4140 /// Constructs a 256-bit floating-point vector of [4 x double], with each
4141 /// of the four double-precision floating-point vector elements set to the
4142 /// specified double-precision floating-point value.
4143 ///
4144 /// \headerfile <x86intrin.h>
4145 ///
4146 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4147 ///
4148 /// \param __w
4149 /// A double-precision floating-point value used to initialize each vector
4150 /// element of the result.
4151 /// \returns An initialized 256-bit floating-point vector of [4 x double].
4152 static __inline __m256d __DEFAULT_FN_ATTRS
4153 _mm256_set1_pd(double __w)
4154 {
4155  return _mm256_set_pd(__w, __w, __w, __w);
4156 }
4157 
4158 /// Constructs a 256-bit floating-point vector of [8 x float], with each
4159 /// of the eight single-precision floating-point vector elements set to the
4160 /// specified single-precision floating-point value.
4161 ///
4162 /// \headerfile <x86intrin.h>
4163 ///
4164 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4165 /// instruction.
4166 ///
4167 /// \param __w
4168 /// A single-precision floating-point value used to initialize each vector
4169 /// element of the result.
4170 /// \returns An initialized 256-bit floating-point vector of [8 x float].
4171 static __inline __m256 __DEFAULT_FN_ATTRS
4172 _mm256_set1_ps(float __w)
4173 {
4174  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4175 }
4176 
4177 /// Constructs a 256-bit integer vector of [8 x i32], with each of the
4178 /// 32-bit integral vector elements set to the specified 32-bit integral
4179 /// value.
4180 ///
4181 /// \headerfile <x86intrin.h>
4182 ///
4183 /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4184 /// instruction.
4185 ///
4186 /// \param __i
4187 /// A 32-bit integral value used to initialize each vector element of the
4188 /// result.
4189 /// \returns An initialized 256-bit integer vector of [8 x i32].
4190 static __inline __m256i __DEFAULT_FN_ATTRS
4192 {
4193  return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4194 }
4195 
4196 /// Constructs a 256-bit integer vector of [16 x i16], with each of the
4197 /// 16-bit integral vector elements set to the specified 16-bit integral
4198 /// value.
4199 ///
4200 /// \headerfile <x86intrin.h>
4201 ///
4202 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4203 ///
4204 /// \param __w
4205 /// A 16-bit integral value used to initialize each vector element of the
4206 /// result.
4207 /// \returns An initialized 256-bit integer vector of [16 x i16].
4208 static __inline __m256i __DEFAULT_FN_ATTRS
4210 {
4211  return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4212  __w, __w, __w, __w, __w, __w, __w, __w);
4213 }
4214 
4215 /// Constructs a 256-bit integer vector of [32 x i8], with each of the
4216 /// 8-bit integral vector elements set to the specified 8-bit integral value.
4217 ///
4218 /// \headerfile <x86intrin.h>
4219 ///
4220 /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4221 ///
4222 /// \param __b
4223 /// An 8-bit integral value used to initialize each vector element of the
4224 /// result.
4225 /// \returns An initialized 256-bit integer vector of [32 x i8].
4226 static __inline __m256i __DEFAULT_FN_ATTRS
4228 {
4229  return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4230  __b, __b, __b, __b, __b, __b, __b, __b,
4231  __b, __b, __b, __b, __b, __b, __b, __b,
4232  __b, __b, __b, __b, __b, __b, __b, __b);
4233 }
4234 
4235 /// Constructs a 256-bit integer vector of [4 x i64], with each of the
4236 /// 64-bit integral vector elements set to the specified 64-bit integral
4237 /// value.
4238 ///
4239 /// \headerfile <x86intrin.h>
4240 ///
4241 /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4242 ///
4243 /// \param __q
4244 /// A 64-bit integral value used to initialize each vector element of the
4245 /// result.
4246 /// \returns An initialized 256-bit integer vector of [4 x i64].
4247 static __inline __m256i __DEFAULT_FN_ATTRS
4248 _mm256_set1_epi64x(long long __q)
4249 {
4250  return _mm256_set_epi64x(__q, __q, __q, __q);
4251 }
4252 
4253 /* Create __zeroed vectors */
4254 /// Constructs a 256-bit floating-point vector of [4 x double] with all
4255 /// vector elements initialized to zero.
4256 ///
4257 /// \headerfile <x86intrin.h>
4258 ///
4259 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4260 ///
4261 /// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4262 static __inline __m256d __DEFAULT_FN_ATTRS
4264 {
4265  return __extension__ (__m256d){ 0, 0, 0, 0 };
4266 }
4267 
4268 /// Constructs a 256-bit floating-point vector of [8 x float] with all
4269 /// vector elements initialized to zero.
4270 ///
4271 /// \headerfile <x86intrin.h>
4272 ///
4273 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4274 ///
4275 /// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4276 static __inline __m256 __DEFAULT_FN_ATTRS
4278 {
4279  return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
4280 }
4281 
4282 /// Constructs a 256-bit integer vector initialized to zero.
4283 ///
4284 /// \headerfile <x86intrin.h>
4285 ///
4286 /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4287 ///
4288 /// \returns A 256-bit integer vector initialized to zero.
4289 static __inline __m256i __DEFAULT_FN_ATTRS
4291 {
4292  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4293 }
4294 
4295 /* Cast between vector types */
4296 /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4297 /// floating-point vector of [8 x float].
4298 ///
4299 /// \headerfile <x86intrin.h>
4300 ///
4301 /// This intrinsic has no corresponding instruction.
4302 ///
4303 /// \param __a
4304 /// A 256-bit floating-point vector of [4 x double].
4305 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
4306 /// bitwise pattern as the parameter.
4307 static __inline __m256 __DEFAULT_FN_ATTRS
4308 _mm256_castpd_ps(__m256d __a)
4309 {
4310  return (__m256)__a;
4311 }
4312 
4313 /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4314 /// integer vector.
4315 ///
4316 /// \headerfile <x86intrin.h>
4317 ///
4318 /// This intrinsic has no corresponding instruction.
4319 ///
4320 /// \param __a
4321 /// A 256-bit floating-point vector of [4 x double].
4322 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
4323 /// parameter.
4324 static __inline __m256i __DEFAULT_FN_ATTRS
4326 {
4327  return (__m256i)__a;
4328 }
4329 
4330 /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4331 /// floating-point vector of [4 x double].
4332 ///
4333 /// \headerfile <x86intrin.h>
4334 ///
4335 /// This intrinsic has no corresponding instruction.
4336 ///
4337 /// \param __a
4338 /// A 256-bit floating-point vector of [8 x float].
4339 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
4340 /// bitwise pattern as the parameter.
4341 static __inline __m256d __DEFAULT_FN_ATTRS
4342 _mm256_castps_pd(__m256 __a)
4343 {
4344  return (__m256d)__a;
4345 }
4346 
4347 /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4348 /// integer vector.
4349 ///
4350 /// \headerfile <x86intrin.h>
4351 ///
4352 /// This intrinsic has no corresponding instruction.
4353 ///
4354 /// \param __a
4355 /// A 256-bit floating-point vector of [8 x float].
4356 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
4357 /// parameter.
4358 static __inline __m256i __DEFAULT_FN_ATTRS
4360 {
4361  return (__m256i)__a;
4362 }
4363 
4364 /// Casts a 256-bit integer vector into a 256-bit floating-point vector
4365 /// of [8 x float].
4366 ///
4367 /// \headerfile <x86intrin.h>
4368 ///
4369 /// This intrinsic has no corresponding instruction.
4370 ///
4371 /// \param __a
4372 /// A 256-bit integer vector.
4373 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
4374 /// bitwise pattern as the parameter.
4375 static __inline __m256 __DEFAULT_FN_ATTRS
4377 {
4378  return (__m256)__a;
4379 }
4380 
4381 /// Casts a 256-bit integer vector into a 256-bit floating-point vector
4382 /// of [4 x double].
4383 ///
4384 /// \headerfile <x86intrin.h>
4385 ///
4386 /// This intrinsic has no corresponding instruction.
4387 ///
4388 /// \param __a
4389 /// A 256-bit integer vector.
4390 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
4391 /// bitwise pattern as the parameter.
4392 static __inline __m256d __DEFAULT_FN_ATTRS
4394 {
4395  return (__m256d)__a;
4396 }
4397 
4398 /// Returns the lower 128 bits of a 256-bit floating-point vector of
4399 /// [4 x double] as a 128-bit floating-point vector of [2 x double].
4400 ///
4401 /// \headerfile <x86intrin.h>
4402 ///
4403 /// This intrinsic has no corresponding instruction.
4404 ///
4405 /// \param __a
4406 /// A 256-bit floating-point vector of [4 x double].
4407 /// \returns A 128-bit floating-point vector of [2 x double] containing the
4408 /// lower 128 bits of the parameter.
4409 static __inline __m128d __DEFAULT_FN_ATTRS
4411 {
4412  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4413 }
4414 
4415 /// Returns the lower 128 bits of a 256-bit floating-point vector of
4416 /// [8 x float] as a 128-bit floating-point vector of [4 x float].
4417 ///
4418 /// \headerfile <x86intrin.h>
4419 ///
4420 /// This intrinsic has no corresponding instruction.
4421 ///
4422 /// \param __a
4423 /// A 256-bit floating-point vector of [8 x float].
4424 /// \returns A 128-bit floating-point vector of [4 x float] containing the
4425 /// lower 128 bits of the parameter.
4426 static __inline __m128 __DEFAULT_FN_ATTRS
4428 {
4429  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4430 }
4431 
4432 /// Truncates a 256-bit integer vector into a 128-bit integer vector.
4433 ///
4434 /// \headerfile <x86intrin.h>
4435 ///
4436 /// This intrinsic has no corresponding instruction.
4437 ///
4438 /// \param __a
4439 /// A 256-bit integer vector.
4440 /// \returns A 128-bit integer vector containing the lower 128 bits of the
4441 /// parameter.
4442 static __inline __m128i __DEFAULT_FN_ATTRS
4444 {
4445  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4446 }
4447 
4448 /// Constructs a 256-bit floating-point vector of [4 x double] from a
4449 /// 128-bit floating-point vector of [2 x double].
4450 ///
4451 /// The lower 128 bits contain the value of the source vector. The contents
4452 /// of the upper 128 bits are undefined.
4453 ///
4454 /// \headerfile <x86intrin.h>
4455 ///
4456 /// This intrinsic has no corresponding instruction.
4457 ///
4458 /// \param __a
4459 /// A 128-bit vector of [2 x double].
4460 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4461 /// contain the value of the parameter. The contents of the upper 128 bits
4462 /// are undefined.
4463 static __inline __m256d __DEFAULT_FN_ATTRS
4465 {
4466  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
4467 }
4468 
4469 /// Constructs a 256-bit floating-point vector of [8 x float] from a
4470 /// 128-bit floating-point vector of [4 x float].
4471 ///
4472 /// The lower 128 bits contain the value of the source vector. The contents
4473 /// of the upper 128 bits are undefined.
4474 ///
4475 /// \headerfile <x86intrin.h>
4476 ///
4477 /// This intrinsic has no corresponding instruction.
4478 ///
4479 /// \param __a
4480 /// A 128-bit vector of [4 x float].
4481 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4482 /// contain the value of the parameter. The contents of the upper 128 bits
4483 /// are undefined.
4484 static __inline __m256 __DEFAULT_FN_ATTRS
4486 {
4487  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
4488 }
4489 
4490 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
4491 ///
4492 /// The lower 128 bits contain the value of the source vector. The contents
4493 /// of the upper 128 bits are undefined.
4494 ///
4495 /// \headerfile <x86intrin.h>
4496 ///
4497 /// This intrinsic has no corresponding instruction.
4498 ///
4499 /// \param __a
4500 /// A 128-bit integer vector.
4501 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4502 /// the parameter. The contents of the upper 128 bits are undefined.
4503 static __inline __m256i __DEFAULT_FN_ATTRS
4505 {
4506  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
4507 }
4508 
4509 /// Constructs a 256-bit floating-point vector of [4 x double] from a
4510 /// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4511 /// contain the value of the source vector. The upper 128 bits are set
4512 /// to zero.
4513 ///
4514 /// \headerfile <x86intrin.h>
4515 ///
4516 /// This intrinsic has no corresponding instruction.
4517 ///
4518 /// \param __a
4519 /// A 128-bit vector of [2 x double].
4520 /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4521 /// contain the value of the parameter. The upper 128 bits are set to zero.
4522 static __inline __m256d __DEFAULT_FN_ATTRS
4524 {
4525  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4526 }
4527 
4528 /// Constructs a 256-bit floating-point vector of [8 x float] from a
4529 /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4530 /// the value of the source vector. The upper 128 bits are set to zero.
4531 ///
4532 /// \headerfile <x86intrin.h>
4533 ///
4534 /// This intrinsic has no corresponding instruction.
4535 ///
4536 /// \param __a
4537 /// A 128-bit vector of [4 x float].
4538 /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4539 /// contain the value of the parameter. The upper 128 bits are set to zero.
4540 static __inline __m256 __DEFAULT_FN_ATTRS
4542 {
4543  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4544 }
4545 
4546 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
4547 /// The lower 128 bits contain the value of the source vector. The upper
4548 /// 128 bits are set to zero.
4549 ///
4550 /// \headerfile <x86intrin.h>
4551 ///
4552 /// This intrinsic has no corresponding instruction.
4553 ///
4554 /// \param __a
4555 /// A 128-bit integer vector.
4556 /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4557 /// the parameter. The upper 128 bits are set to zero.
4558 static __inline __m256i __DEFAULT_FN_ATTRS
4560 {
4561  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4562 }
4563 
4564 /*
4565  Vector insert.
4566  We use macros rather than inlines because we only want to accept
4567  invocations where the immediate M is a constant expression.
4568 */
4569 /// Constructs a new 256-bit vector of [8 x float] by first duplicating
4570 /// a 256-bit vector of [8 x float] given in the first parameter, and then
4571 /// replacing either the upper or the lower 128 bits with the contents of a
4572 /// 128-bit vector of [4 x float] in the second parameter.
4573 ///
4574 /// The immediate integer parameter determines between the upper or the lower
4575 /// 128 bits.
4576 ///
4577 /// \headerfile <x86intrin.h>
4578 ///
4579 /// \code
4580 /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4581 /// \endcode
4582 ///
4583 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4584 ///
4585 /// \param V1
4586 /// A 256-bit vector of [8 x float]. This vector is copied to the result
4587 /// first, and then either the upper or the lower 128 bits of the result will
4588 /// be replaced by the contents of \a V2.
4589 /// \param V2
4590 /// A 128-bit vector of [4 x float]. The contents of this parameter are
4591 /// written to either the upper or the lower 128 bits of the result depending
4592 /// on the value of parameter \a M.
4593 /// \param M
4594 /// An immediate integer. The least significant bit determines how the values
4595 /// from the two parameters are interleaved: \n
4596 /// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4597 /// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4598 /// result. \n
4599 /// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4600 /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4601 /// result.
4602 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
/* The whole expansion is parenthesized so that a postfix operator applied at
   the call site (e.g. subscripting the result) binds to the cast result and
   not to the builtin call underneath the cast. */
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4606 
4607 /// Constructs a new 256-bit vector of [4 x double] by first duplicating
4608 /// a 256-bit vector of [4 x double] given in the first parameter, and then
4609 /// replacing either the upper or the lower 128 bits with the contents of a
4610 /// 128-bit vector of [2 x double] in the second parameter.
4611 ///
4612 /// The immediate integer parameter determines between the upper or the lower
4613 /// 128 bits.
4614 ///
4615 /// \headerfile <x86intrin.h>
4616 ///
4617 /// \code
4618 /// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
4619 /// \endcode
4620 ///
4621 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4622 ///
4623 /// \param V1
4624 /// A 256-bit vector of [4 x double]. This vector is copied to the result
4625 /// first, and then either the upper or the lower 128 bits of the result will
4626 /// be replaced by the contents of \a V2.
4627 /// \param V2
4628 /// A 128-bit vector of [2 x double]. The contents of this parameter are
4629 /// written to either the upper or the lower 128 bits of the result depending
4630 /// on the value of parameter \a M.
4631 /// \param M
4632 /// An immediate integer. The least significant bit determines how the values
4633 /// from the two parameters are interleaved: \n
4634 /// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4635 /// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4636 /// result. \n
4637 /// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4638 /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4639 /// result.
4640 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
/* The whole expansion is parenthesized so that a postfix operator applied at
   the call site (e.g. subscripting the result) binds to the cast result and
   not to the builtin call underneath the cast. */
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
                                             (__v2df)(__m128d)(V2), (int)(M)))
4644 
4645 /// Constructs a new 256-bit integer vector by first duplicating a
4646 /// 256-bit integer vector given in the first parameter, and then replacing
4647 /// either the upper or the lower 128 bits with the contents of a 128-bit
4648 /// integer vector in the second parameter.
4649 ///
4650 /// The immediate integer parameter determines between the upper or the lower
4651 /// 128 bits.
4652 ///
4653 /// \headerfile <x86intrin.h>
4654 ///
4655 /// \code
4656 /// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
4657 /// \endcode
4658 ///
4659 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4660 ///
4661 /// \param V1
4662 /// A 256-bit integer vector. This vector is copied to the result first, and
4663 /// then either the upper or the lower 128 bits of the result will be
4664 /// replaced by the contents of \a V2.
4665 /// \param V2
4666 /// A 128-bit integer vector. The contents of this parameter are written to
4667 /// either the upper or the lower 128 bits of the result depending on the
4668 /// value of parameter \a M.
4669 /// \param M
4670 /// An immediate integer. The least significant bit determines how the values
4671 /// from the two parameters are interleaved: \n
4672 /// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
4673 /// and bits [255:128] of \a V1 are copied to bits [255:128] of the
4674 /// result. \n
4675 /// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
4676 /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
4677 /// result.
4678 /// \returns A 256-bit integer vector containing the interleaved values.
/* The whole expansion is parenthesized so that a postfix operator applied at
   the call site (e.g. subscripting the result) binds to the cast result and
   not to the builtin call underneath the cast. */
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
                                             (__v4si)(__m128i)(V2), (int)(M)))
4682 
4683 /*
4684  Vector extract.
4685  We use macros rather than inlines because we only want to accept
4686  invocations where the immediate M is a constant expression.
4687 */
4688 /// Extracts either the upper or the lower 128 bits from a 256-bit vector
4689 /// of [8 x float], as determined by the immediate integer parameter, and
4690 /// returns the extracted bits as a 128-bit vector of [4 x float].
4691 ///
4692 /// \headerfile <x86intrin.h>
4693 ///
4694 /// \code
4695 /// __m128 _mm256_extractf128_ps(__m256 V, const int M);
4696 /// \endcode
4697 ///
4698 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4699 ///
4700 /// \param V
4701 /// A 256-bit vector of [8 x float].
4702 /// \param M
4703 /// An immediate integer. The least significant bit determines which bits are
4704 /// extracted from the first parameter: \n
4705 /// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4706 /// result. \n
4707 /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4708 /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
/* The whole expansion is parenthesized so that a postfix operator applied at
   the call site (e.g. subscripting the result) binds to the cast result and
   not to the builtin call underneath the cast. */
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
4711 
4712 /// Extracts either the upper or the lower 128 bits from a 256-bit vector
4713 /// of [4 x double], as determined by the immediate integer parameter, and
4714 /// returns the extracted bits as a 128-bit vector of [2 x double].
4715 ///
4716 /// \headerfile <x86intrin.h>
4717 ///
4718 /// \code
4719 /// __m128d _mm256_extractf128_pd(__m256d V, const int M);
4720 /// \endcode
4721 ///
4722 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4723 ///
4724 /// \param V
4725 /// A 256-bit vector of [4 x double].
4726 /// \param M
4727 /// An immediate integer. The least significant bit determines which bits are
4728 /// extracted from the first parameter: \n
4729 /// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4730 /// result. \n
4731 /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4732 /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
/* The whole expansion is parenthesized so that a postfix operator applied at
   the call site (e.g. subscripting the result) binds to the cast result and
   not to the builtin call underneath the cast. */
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
4735 
4736 /// Extracts either the upper or the lower 128 bits from a 256-bit
4737 /// integer vector, as determined by the immediate integer parameter, and
4738 /// returns the extracted bits as a 128-bit integer vector.
4739 ///
4740 /// \headerfile <x86intrin.h>
4741 ///
4742 /// \code
4743 /// __m128i _mm256_extractf128_si256(__m256i V, const int M);
4744 /// \endcode
4745 ///
4746 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
4747 ///
4748 /// \param V
4749 /// A 256-bit integer vector.
4750 /// \param M
4751 /// An immediate integer. The least significant bit determines which bits are
4752 /// extracted from the first parameter: \n
4753 /// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
4754 /// result. \n
4755 /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
4756 /// \returns A 128-bit integer vector containing the extracted bits.
/* The whole expansion is parenthesized so that a postfix operator applied at
   the call site (e.g. subscripting the result) binds to the cast result and
   not to the builtin call underneath the cast. */
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
4759 
4760 /* SIMD load ops (unaligned) */
4761 /// Loads two 128-bit floating-point vectors of [4 x float] from
4762 /// unaligned memory locations and constructs a 256-bit floating-point vector
4763 /// of [8 x float] by concatenating the two 128-bit vectors.
4764 ///
4765 /// \headerfile <x86intrin.h>
4766 ///
4767 /// This intrinsic corresponds to load instructions followed by the
4768 /// <c> VINSERTF128 </c> instruction.
4769 ///
4770 /// \param __addr_hi
4771 /// A pointer to a 128-bit memory location containing 4 consecutive
4772 /// single-precision floating-point values. These values are to be copied to
4773 /// bits[255:128] of the result. The address of the memory location does not
4774 /// have to be aligned.
4775 /// \param __addr_lo
4776 /// A pointer to a 128-bit memory location containing 4 consecutive
4777 /// single-precision floating-point values. These values are to be copied to
4778 /// bits[127:0] of the result. The address of the memory location does not
4779 /// have to be aligned.
4780 /// \returns A 256-bit floating-point vector of [8 x float] containing the
4781 /// concatenated result.
4782 static __inline __m256 __DEFAULT_FN_ATTRS
4783 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4784 {
4785  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4786  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4787 }
4788 
4789 /// Loads two 128-bit floating-point vectors of [2 x double] from
4790 /// unaligned memory locations and constructs a 256-bit floating-point vector
4791 /// of [4 x double] by concatenating the two 128-bit vectors.
4792 ///
4793 /// \headerfile <x86intrin.h>
4794 ///
4795 /// This intrinsic corresponds to load instructions followed by the
4796 /// <c> VINSERTF128 </c> instruction.
4797 ///
4798 /// \param __addr_hi
4799 /// A pointer to a 128-bit memory location containing two consecutive
4800 /// double-precision floating-point values. These values are to be copied to
4801 /// bits[255:128] of the result. The address of the memory location does not
4802 /// have to be aligned.
4803 /// \param __addr_lo
4804 /// A pointer to a 128-bit memory location containing two consecutive
4805 /// double-precision floating-point values. These values are to be copied to
4806 /// bits[127:0] of the result. The address of the memory location does not
4807 /// have to be aligned.
4808 /// \returns A 256-bit floating-point vector of [4 x double] containing the
4809 /// concatenated result.
4810 static __inline __m256d __DEFAULT_FN_ATTRS
4811 _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4812 {
4813  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4814  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4815 }
4816 
4817 /// Loads two 128-bit integer vectors from unaligned memory locations and
4818 /// constructs a 256-bit integer vector by concatenating the two 128-bit
4819 /// vectors.
4820 ///
4821 /// \headerfile <x86intrin.h>
4822 ///
4823 /// This intrinsic corresponds to load instructions followed by the
4824 /// <c> VINSERTF128 </c> instruction.
4825 ///
4826 /// \param __addr_hi
4827 /// A pointer to a 128-bit memory location containing a 128-bit integer
4828 /// vector. This vector is to be copied to bits[255:128] of the result. The
4829 /// address of the memory location does not have to be aligned.
4830 /// \param __addr_lo
4831 /// A pointer to a 128-bit memory location containing a 128-bit integer
4832 /// vector. This vector is to be copied to bits[127:0] of the result. The
4833 /// address of the memory location does not have to be aligned.
4834 /// \returns A 256-bit integer vector containing the concatenated result.
4835 static __inline __m256i __DEFAULT_FN_ATTRS
4836 _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
4837 {
4838  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4839  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4840 }
4841 
4842 /* SIMD store ops (unaligned) */
4843 /// Stores the upper and lower 128 bits of a 256-bit floating-point
4844 /// vector of [8 x float] into two different unaligned memory locations.
4845 ///
4846 /// \headerfile <x86intrin.h>
4847 ///
4848 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4849 /// store instructions.
4850 ///
4851 /// \param __addr_hi
4852 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4853 /// copied to this memory location. The address of this memory location does
4854 /// not have to be aligned.
4855 /// \param __addr_lo
4856 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4857 /// copied to this memory location. The address of this memory location does
4858 /// not have to be aligned.
4859 /// \param __a
4860 /// A 256-bit floating-point vector of [8 x float].
4861 static __inline void __DEFAULT_FN_ATTRS
4862 _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
4863 {
4864  __m128 __v128;
4865 
4866  __v128 = _mm256_castps256_ps128(__a);
4867  _mm_storeu_ps(__addr_lo, __v128);
4868  __v128 = _mm256_extractf128_ps(__a, 1);
4869  _mm_storeu_ps(__addr_hi, __v128);
4870 }
4871 
4872 /// Stores the upper and lower 128 bits of a 256-bit floating-point
4873 /// vector of [4 x double] into two different unaligned memory locations.
4874 ///
4875 /// \headerfile <x86intrin.h>
4876 ///
4877 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4878 /// store instructions.
4879 ///
4880 /// \param __addr_hi
4881 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4882 /// copied to this memory location. The address of this memory location does
4883 /// not have to be aligned.
4884 /// \param __addr_lo
4885 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4886 /// copied to this memory location. The address of this memory location does
4887 /// not have to be aligned.
4888 /// \param __a
4889 /// A 256-bit floating-point vector of [4 x double].
4890 static __inline void __DEFAULT_FN_ATTRS
4891 _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
4892 {
4893  __m128d __v128;
4894 
4895  __v128 = _mm256_castpd256_pd128(__a);
4896  _mm_storeu_pd(__addr_lo, __v128);
4897  __v128 = _mm256_extractf128_pd(__a, 1);
4898  _mm_storeu_pd(__addr_hi, __v128);
4899 }
4900 
4901 /// Stores the upper and lower 128 bits of a 256-bit integer vector into
4902 /// two different unaligned memory locations.
4903 ///
4904 /// \headerfile <x86intrin.h>
4905 ///
4906 /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
4907 /// store instructions.
4908 ///
4909 /// \param __addr_hi
4910 /// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
4911 /// copied to this memory location. The address of this memory location does
4912 /// not have to be aligned.
4913 /// \param __addr_lo
4914 /// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
4915 /// copied to this memory location. The address of this memory location does
4916 /// not have to be aligned.
4917 /// \param __a
4918 /// A 256-bit integer vector.
4919 static __inline void __DEFAULT_FN_ATTRS
4920 _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
4921 {
4922  __m128i __v128;
4923 
4924  __v128 = _mm256_castsi256_si128(__a);
4925  _mm_storeu_si128(__addr_lo, __v128);
4926  __v128 = _mm256_extractf128_si256(__a, 1);
4927  _mm_storeu_si128(__addr_hi, __v128);
4928 }
4929 
4930 /// Constructs a 256-bit floating-point vector of [8 x float] by
4931 /// concatenating two 128-bit floating-point vectors of [4 x float].
4932 ///
4933 /// \headerfile <x86intrin.h>
4934 ///
4935 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4936 ///
4937 /// \param __hi
4938 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4939 /// 128 bits of the result.
4940 /// \param __lo
4941 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4942 /// 128 bits of the result.
4943 /// \returns A 256-bit floating-point vector of [8 x float] containing the
4944 /// concatenated result.
4945 static __inline __m256 __DEFAULT_FN_ATTRS
4946 _mm256_set_m128 (__m128 __hi, __m128 __lo)
4947 {
4948  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4949 }
4950 
4951 /// Constructs a 256-bit floating-point vector of [4 x double] by
4952 /// concatenating two 128-bit floating-point vectors of [2 x double].
4953 ///
4954 /// \headerfile <x86intrin.h>
4955 ///
4956 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4957 ///
4958 /// \param __hi
4959 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4960 /// 128 bits of the result.
4961 /// \param __lo
4962 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4963 /// 128 bits of the result.
4964 /// \returns A 256-bit floating-point vector of [4 x double] containing the
4965 /// concatenated result.
4966 static __inline __m256d __DEFAULT_FN_ATTRS
4967 _mm256_set_m128d (__m128d __hi, __m128d __lo)
4968 {
4969  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4970 }
4971 
4972 /// Constructs a 256-bit integer vector by concatenating two 128-bit
4973 /// integer vectors.
4974 ///
4975 /// \headerfile <x86intrin.h>
4976 ///
4977 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4978 ///
4979 /// \param __hi
4980 /// A 128-bit integer vector to be copied to the upper 128 bits of the
4981 /// result.
4982 /// \param __lo
4983 /// A 128-bit integer vector to be copied to the lower 128 bits of the
4984 /// result.
4985 /// \returns A 256-bit integer vector containing the concatenated result.
4986 static __inline __m256i __DEFAULT_FN_ATTRS
4987 _mm256_set_m128i (__m128i __hi, __m128i __lo)
4988 {
4989  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
4990 }
4991 
4992 /// Constructs a 256-bit floating-point vector of [8 x float] by
4993 /// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4994 /// similar to _mm256_set_m128, but the order of the input parameters is
4995 /// swapped.
4996 ///
4997 /// \headerfile <x86intrin.h>
4998 ///
4999 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5000 ///
5001 /// \param __lo
5002 /// A 128-bit floating-point vector of [4 x float] to be copied to the lower
5003 /// 128 bits of the result.
5004 /// \param __hi
5005 /// A 128-bit floating-point vector of [4 x float] to be copied to the upper
5006 /// 128 bits of the result.
5007 /// \returns A 256-bit floating-point vector of [8 x float] containing the
5008 /// concatenated result.
5009 static __inline __m256 __DEFAULT_FN_ATTRS
5010 _mm256_setr_m128 (__m128 __lo, __m128 __hi)
5011 {
5012  return _mm256_set_m128(__hi, __lo);
5013 }
5014 
5015 /// Constructs a 256-bit floating-point vector of [4 x double] by
5016 /// concatenating two 128-bit floating-point vectors of [2 x double]. This is
5017 /// similar to _mm256_set_m128d, but the order of the input parameters is
5018 /// swapped.
5019 ///
5020 /// \headerfile <x86intrin.h>
5021 ///
5022 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5023 ///
5024 /// \param __lo
5025 /// A 128-bit floating-point vector of [2 x double] to be copied to the lower
5026 /// 128 bits of the result.
5027 /// \param __hi
5028 /// A 128-bit floating-point vector of [2 x double] to be copied to the upper
5029 /// 128 bits of the result.
5030 /// \returns A 256-bit floating-point vector of [4 x double] containing the
5031 /// concatenated result.
5032 static __inline __m256d __DEFAULT_FN_ATTRS
5033 _mm256_setr_m128d (__m128d __lo, __m128d __hi)
5034 {
5035  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5036 }
5037 
5038 /// Constructs a 256-bit integer vector by concatenating two 128-bit
5039 /// integer vectors. This is similar to _mm256_set_m128i, but the order of
5040 /// the input parameters is swapped.
5041 ///
5042 /// \headerfile <x86intrin.h>
5043 ///
5044 /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
5045 ///
5046 /// \param __lo
5047 /// A 128-bit integer vector to be copied to the lower 128 bits of the
5048 /// result.
5049 /// \param __hi
5050 /// A 128-bit integer vector to be copied to the upper 128 bits of the
5051 /// result.
5052 /// \returns A 256-bit integer vector containing the concatenated result.
5053 static __inline __m256i __DEFAULT_FN_ATTRS
5054 _mm256_setr_m128i (__m128i __lo, __m128i __hi)
5055 {
5056  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
5057 }
5058 
5059 #undef __DEFAULT_FN_ATTRS
5060 
5061 #endif /* __AVXINTRIN_H */
#define _mm256_extractf128_si256(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit integer vector, as determined by the immediate integer parameter, and returns the extracted bits as a 128-bit integer vector.
Definition: avxintrin.h:4757
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:754
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a)
Calculates the reciprocal square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:360
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
Loads two 128-bit integer vectors from unaligned memory locations and constructs a 256-bit integer vector by concatenating the two 128-bit vectors.
Definition: avxintrin.h:4836
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:708
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_undefined_pd(void)
Create a 256-bit vector of [4 x double] with undefined values.
Definition: avxintrin.h:3577
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c)
Copies the values in a 256-bit vector of [4 x double] as specified by the 256-bit integer vector oper...
Definition: avxintrin.h:823
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
Moves double-precision values from a 256-bit vector of [4 x double] to a memory location pointed to b...
Definition: avxintrin.h:3477
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a)
Loads a scalar double-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3001
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
Definition: avxintrin.h:4277
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Loads two 128-bit floating-point vectors of [4 x float] from unaligned memory locations and constructs a 256-bit floating-point vector of [8 x float] by concatenating the two 128-bit vectors.
Definition: avxintrin.h:4783
static __inline float __DEFAULT_FN_ATTRS _mm256_cvtss_f32(__m256 __a)
Returns the first element of the input vector of [8 x float].
Definition: avxintrin.h:2299
static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2648
static __inline __m128 __DEFAULT_FN_ATTRS _mm_permutevar_ps(__m128 __a, __m128i __c)
Copies the values stored in a 128-bit vector of [4 x float] as specified by the 128-bit integer vecto...
Definition: avxintrin.h:877
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p)
Loads 8 single-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:3095
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [8 x float] into two different unaligned memory locations.
Definition: avxintrin.h:4862
static __inline int __DEFAULT_FN_ATTRS _mm_testc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2530
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a)
Moves and duplicates even-indexed values from a 256-bit vector of [8 x float] to float values in a 25...
Definition: avxintrin.h:2350
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(__m256i *__a, __m256i __b)
Moves integer data from a 256-bit integer vector to a 32-byte aligned memory location.
Definition: avxintrin.h:3521
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i(__m128i __hi, __m128i __lo)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors.
Definition: avxintrin.h:4987
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:625
#define _mm256_insertf128_ps(V1, V2, M)
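Constructs a new 256-bit vector of [8 x float] by first duplicating a 256-bit vector of [8 x float] given in the first parameter, and then replacing either the upper or the lower 128 bits with the contents of a 128-bit vector of [4 x float] in the second parameter.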
Definition: avxintrin.h:4603
static __inline void __DEFAULT_FN_ATTRS _mm256_zeroupper(void)
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
Definition: avxintrin.h:2963
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a)
Calculates the reciprocals of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:377
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2736
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_zextsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4559
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a)
Extracts the sign bits of single-precision floating point elements in a 256-bit vector of [8 x float]...
Definition: avxintrin.h:2940
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2677
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b)
Constructs a 256-bit integer vector of [32 x i8], with each of the 8-bit integral vector elements set...
Definition: avxintrin.h:4227
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:547
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, __m256 __b)
Multiplies two 256-bit vectors of [8 x float].
Definition: avxintrin.h:309
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p)
Loads 256 bits of integer data from a 32-byte aligned memory location pointed to by __p into elements...
Definition: avxintrin.h:3152
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b)
Unpacks the odd-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves them...
Definition: avxintrin.h:2395
static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to a 32-byte aligned memory location pointed to b...
Definition: avxintrin.h:3286
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2444
#define _mm256_extractf128_ps(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [8 x float], as determined by the immediate integer parameter, and returns the extracted bits as a 128-bit vector of [4 x float].
Definition: avxintrin.h:4709
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: avxintrin.h:4101
#define _mm256_insertf128_si256(V1, V2, M)
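Constructs a new 256-bit integer vector by first duplicating a 256-bit integer vector given in the first parameter, and then replacing either the upper or the lower 128 bits with the contents of a 128-bit integer vector in the second parameter.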
Definition: avxintrin.h:4679
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d(__m128d __lo, __m128d __hi)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-point vectors of [2 x double].
Definition: avxintrin.h:5033
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: avxintrin.h:4016
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:529
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32].
Definition: avxintrin.h:2234
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector initialized with the specified 64-bit integral values...
Definition: avxintrin.h:3867
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q)
Constructs a 256-bit integer vector of [4 x i64], with each of the 64-bit integral vector elements se...
Definition: avxintrin.h:4248
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b)
Horizontally subtracts the adjacent pairs of values contained in two 256-bit vectors of [4 x double]...
Definition: avxintrin.h:731
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a)
Calculates the square roots of the values in a 256-bit vector of [4 x double].
Definition: avxintrin.h:326
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [4 x double].
Definition: avxintrin.h:4393
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00)
Constructs a 256-bit integer vector initialized with the specified 16-bit integral values...
Definition: avxintrin.h:3749
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, __m256d __b)
Subtracts two 256-bit vectors of [4 x double].
Definition: avxintrin.h:105
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b)
Horizontally adds the adjacent pairs of values contained in two 256-bit vectors of [4 x double]...
Definition: avxintrin.h:685
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, __m256 __b)
Subtracts two 256-bit vectors of [8 x float].
Definition: avxintrin.h:123
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
Stores the upper and lower 128 bits of a 256-bit integer vector into two different unaligned memory locations.
Definition: avxintrin.h:4920
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1652
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_undefined_si256(void)
Create a 256-bit integer vector with undefined values.
Definition: avxintrin.h:3603
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:1987
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d(__m128d __hi, __m128d __lo)
Constructs a 256-bit floating-point vector of [4 x double] by concatenating two 128-bit floating-poin...
Definition: avxintrin.h:4967
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2824
#define _mm256_extractf128_pd(V, M)
Extracts either the upper or the lower 128 bits from a 256-bit vector of [4 x double], as determined by the immediate integer parameter, and returns the extracted bits as a 128-bit vector of [2 x double].
Definition: avxintrin.h:4733
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_zextpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4523
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors...
Definition: avxintrin.h:2850
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w)
Constructs a 256-bit floating-point vector of [8 x float], with each of the eight single-precision floating-point vector elements set to the specified single-precision floating-point value.
Definition: avxintrin.h:4172
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
Merges 64-bit double-precision data values stored in either of the two 256-bit vectors of [4 x double...
Definition: avxintrin.h:1392
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double] initialized with the specified double-prec...
Definition: avxintrin.h:3630
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x i32], truncating the result by rounding towards zero when it is inexact.
Definition: avxintrin.h:2217
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, __m256 __b)
Adds two 256-bit vectors of [8 x float].
Definition: avxintrin.h:87
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [4 x doub...
Definition: avxintrin.h:142
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3524
static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [4 x double] as a 128-bit floating-p...
Definition: avxintrin.h:4410
static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to a 32-byte alig...
Definition: avxintrin.h:3209
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:38
static __inline double __DEFAULT_FN_ATTRS _mm256_cvtsd_f64(__m256d __a)
Returns the first element of the input vector of [4 x double].
Definition: avxintrin.h:2266
static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a memory locatio...
Definition: avxintrin.h:3429
static __inline int __DEFAULT_FN_ATTRS _mm_testz_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2589
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a)
Constructs a 256-bit integer vector from a 128-bit integer vector.
Definition: avxintrin.h:4504
static __inline int __DEFAULT_FN_ATTRS _mm256_cvtsi256_si32(__m256i __a)
Returns the first element of the input vector of [8 x i32].
Definition: avxintrin.h:2282
static __inline __m128d __DEFAULT_FN_ATTRS _mm_maskload_pd(double const *__p, __m128i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3331
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a)
Loads the data from a 128-bit vector of [4 x float] from the specified address pointed to by __a and ...
Definition: avxintrin.h:3059
static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a)
Truncates a 256-bit integer vector into a 128-bit integer vector.
Definition: avxintrin.h:4443
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector initialized with the specified 32-bit integral values...
Definition: avxintrin.h:3701
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors...
Definition: avxintrin.h:2876
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the lesser of each pair of values...
Definition: avxintrin.h:273
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:1915
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float] initialized with the specified single-preci...
Definition: avxintrin.h:3669
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2794
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a)
Moves and duplicates double-precision floating point values from a 256-bit vector of [4 x double] to ...
Definition: avxintrin.h:2372
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_zextps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition: avxintrin.h:4541
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1880
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void)
Create a 256-bit vector of [8 x float] with undefined values.
Definition: avxintrin.h:3590
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
Loads two 128-bit floating-point vectors of [2 x double] from unaligned memory locations and construc...
Definition: avxintrin.h:4811
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d)
Constructs a 256-bit floating-point vector of [4 x double], initialized in reverse order with the spe...
Definition: avxintrin.h:3896
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i(__m128i __lo, __m128i __hi)
Constructs a 256-bit integer vector by concatenating two 128-bit integer vectors. ...
Definition: avxintrin.h:5054
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a)
Stores double-precision floating point values from a 256-bit vector of [4 x double] to an unaligned m...
Definition: avxintrin.h:3245
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w)
Constructs a 256-bit integer vector of [16 x i16], with each of the 16-bit integral vector elements set to the specified 16-bit integral value.
Definition: avxintrin.h:4209
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit int...
Definition: avxintrin.h:3168
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a)
Loads the data from a 128-bit vector of [2 x double] from the specified address pointed to by __a and...
Definition: avxintrin.h:3039
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a)
Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 x float].
Definition: avxintrin.h:2169
static __inline__ vector float vector float __b
Definition: altivec.h:534
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p)
Loads 4 double-precision floating point values from a 32-byte aligned memory location pointed to by _...
Definition: avxintrin.h:3079
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to an unaligned me...
Definition: avxintrin.h:3265
static __inline void __DEFAULT_FN_ATTRS _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
Moves double-precision values from a 128-bit vector of [2 x double] to a memory location pointed to b...
Definition: avxintrin.h:3453
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit integer vector.
Definition: avxintrin.h:4325
static __inline int __DEFAULT_FN_ATTRS _mm_testz_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2501
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit floating-point vector of [4 x dou...
Definition: avxintrin.h:4342
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b)
Unpacks the even-indexed vector elements from two 256-bit vectors of [4 x double] and interleaves the...
Definition: avxintrin.h:2417
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, __m256 __b)
Divides two 256-bit vectors of [8 x float].
Definition: avxintrin.h:197
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:3020
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b)
Compares two 256-bit vectors of [8 x float] and returns the greater of each pair of values...
Definition: avxintrin.h:235
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1999
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a)
Converts a vector of [8 x i32] into a vector of [8 x float].
Definition: avxintrin.h:2153
static __inline __m128 __DEFAULT_FN_ATTRS _mm_broadcast_ss(float const *__a)
Loads a scalar single-precision floating point value from the specified address pointed to by __a and...
Definition: avxintrin.h:2982
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i const *__p)
Loads 256 bits of integer data from an unaligned memory location pointed to by __p into a 256-bit int...
Definition: avxintrin.h:3189
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the valu...
Definition: avxintrin.h:568
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, __m256d __b)
Divides two 256-bit vectors of [4 x double].
Definition: avxintrin.h:179
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a)
Constructs a 256-bit floating-point vector of [4 x double] from a 128-bit floating-point vector of [2...
Definition: avxintrin.h:4464
static __inline __m128d __DEFAULT_FN_ATTRS _mm_permutevar_pd(__m128d __a, __m128i __c)
Copies the values in a 128-bit vector of [2 x double] as specified by the 128-bit integer vector oper...
Definition: avxintrin.h:784
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p)
Loads 4 double-precision floating point values from an unaligned memory location pointed to by __p in...
Definition: avxintrin.h:3112
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3404
static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_pd(__m128d __a, __m128d __b)
Given two 128-bit floating-point vectors of [2 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2560
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition: avxintrin.h:4376
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_m128(__m128 __lo, __m128 __hi)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:5010
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
Definition: avxintrin.h:661
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a)
Constructs a 256-bit floating-point vector of [8 x float] from a 128-bit floating-point vector of [4 ...
Definition: avxintrin.h:4485
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the value...
Definition: avxintrin.h:589
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, __m256d __b)
Multiplies two 256-bit vectors of [4 x double].
Definition: avxintrin.h:291
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a)
Calculates the square roots of the values in a 256-bit vector of [8 x float].
Definition: avxintrin.h:343
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_m128(__m128 __hi, __m128 __lo)
Constructs a 256-bit floating-point vector of [8 x float] by concatenating two 128-bit floating-point...
Definition: avxintrin.h:4946
#define _mm256_insertf128_pd(V1, V2, M)
Constructs a new 256-bit vector of [4 x double] by first duplicating a 256-bit vector of [4 x double]...
Definition: avxintrin.h:4641
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3953
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, __m256d __b)
Adds two 256-bit vectors of [4 x double].
Definition: avxintrin.h:69
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i *__p, __m256i __a)
Stores integer values from a 256-bit integer vector to an unaligned memory location pointed to by __p...
Definition: avxintrin.h:3303
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p)
Loads 8 single-precision floating point values from an unaligned memory location pointed to by __p in...
Definition: avxintrin.h:3132
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i)
Constructs a 256-bit integer vector of [8 x i32], with each of the 32-bit integral vector elements set to the specified 32-bit integral value.
Definition: avxintrin.h:4191
static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b)
Given two 256-bit floating-point vectors of [4 x double], perform an element-by-element comparison of...
Definition: avxintrin.h:2706
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c)
Copies the values stored in a 256-bit vector of [8 x float] as specified by the 256-bit integer vecto...
Definition: avxintrin.h:968
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32], truncating the result by rounding towards zero when it is inexact.
Definition: avxintrin.h:2250
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
Stores the upper and lower 128 bits of a 256-bit floating-point vector of [4 x double] into two diffe...
Definition: avxintrin.h:4891
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(float *__p, __m256 __a)
Moves single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte aligne...
Definition: avxintrin.h:3562
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a)
Casts a 256-bit floating-point vector of [8 x float] into a 256-bit integer vector.
Definition: avxintrin.h:4359
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m)
Conditionally loads double-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3355
typedef double __v4df __attribute__((__vector_size__(32)))
Definition: avxintrin.h:31
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:643
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x double].
Definition: avxintrin.h:2138
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h)
Constructs a 256-bit floating-point vector of [8 x float], initialized in reverse order with the spec...
Definition: avxintrin.h:3936
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b)
Adds the even-indexed values and subtracts the odd-indexed values of two 256-bit vectors of [8 x floa...
Definition: avxintrin.h:161
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b)
Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the two 256-bit vectors of [8 x float] ...
Definition: avxintrin.h:2471
static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a)
Extracts the sign bits of double-precision floating point elements in a 256-bit vector of [4 x double...
Definition: avxintrin.h:2922
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a)
Converts a vector of [8 x float] into a vector of [8 x i32].
Definition: avxintrin.h:2184
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
Merges 32-bit single-precision data values stored in either of the two 256-bit vectors of [8 x float]...
Definition: avxintrin.h:1420
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4290
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a)
Moves and duplicates odd-indexed values from a 256-bit vector of [8 x float] to float values in a 256...
Definition: avxintrin.h:2325
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(double *__a, __m256d __b)
Moves double-precision values from a 256-bit vector of [4 x double] to a 32-byte aligned memory locat...
Definition: avxintrin.h:3541
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, char __b19, char __b18, char __b17, char __b16, char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b09, char __b08, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00)
Constructs a 256-bit integer vector initialized with the specified 8-bit integral values...
Definition: avxintrin.h:3832
static __inline void __DEFAULT_FN_ATTRS _mm256_zeroall(void)
Zeroes the contents of all XMM or YMM registers.
Definition: avxintrin.h:2952
static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b)
Given two 256-bit integer vectors, perform a bit-by-bit comparison of the two source vectors...
Definition: avxintrin.h:2903
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3987
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a)
Casts a 256-bit floating-point vector of [4 x double] into a 256-bit floating-point vector of [8 x fl...
Definition: avxintrin.h:4308
static __inline int __DEFAULT_FN_ATTRS _mm_testc_ps(__m128 __a, __m128 __b)
Given two 128-bit floating-point vectors of [4 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2618
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w)
Constructs a 256-bit floating-point vector of [4 x double], with each of the four double-precision floating-point vector elements set to the specified double-precision floating-point value.
Definition: avxintrin.h:4153
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: avxintrin.h:3968
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
Constructs a 256-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: avxintrin.h:4134
static __inline __m128 __DEFAULT_FN_ATTRS _mm_maskload_ps(float const *__p, __m128i __m)
Conditionally loads single-precision floating point elements from a memory location pointed to by __p...
Definition: avxintrin.h:3380
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1752
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
Definition: avxintrin.h:4263
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the greater of each pair of values...
Definition: avxintrin.h:216
#define __DEFAULT_FN_ATTRS
Definition: avxintrin.h:53
static __inline void __DEFAULT_FN_ATTRS _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
Moves single-precision floating point values from a 128-bit vector of [4 x float] to a memory locatio...
Definition: avxintrin.h:3501
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4199
static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a)
Stores single-precision floating point values from a 256-bit vector of [8 x float] to a 32-byte align...
Definition: avxintrin.h:3227
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a)
Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 x double].
Definition: avxintrin.h:2200
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
Definition: avxintrin.h:607
static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b)
Given two 256-bit floating-point vectors of [8 x float], perform an element-by-element comparison of ...
Definition: avxintrin.h:2765
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b)
Compares two 256-bit vectors of [4 x double] and returns the lesser of each pair of values...
Definition: avxintrin.h:254
static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
Definition: avxintrin.h:4427