clang 20.0.0git
emmintrin.h
Go to the documentation of this file.
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __EMMINTRIN_H
11#define __EMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <xmmintrin.h>
18
19typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21
22typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23typedef long long __m128i_u
24 __attribute__((__vector_size__(16), __aligned__(1)));
25
26/* Type defines. */
27typedef double __v2df __attribute__((__vector_size__(16)));
28typedef long long __v2di __attribute__((__vector_size__(16)));
29typedef short __v8hi __attribute__((__vector_size__(16)));
30typedef char __v16qi __attribute__((__vector_size__(16)));
31
32/* Unsigned types */
33typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36
37/* We need an explicitly signed variant for char. Note that this shouldn't
38 * appear in the interface though. */
39typedef signed char __v16qs __attribute__((__vector_size__(16)));
40
41#ifdef __SSE2__
42/* Both _Float16 and __bf16 require SSE2 being enabled. */
43typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46
47typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49#endif
50
51/* Define the default attributes for the functions in this file. */
52#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
53#define __DEFAULT_FN_ATTRS \
54 __attribute__((__always_inline__, __nodebug__, \
55 __target__("sse2,no-evex512"), __min_vector_width__(128)))
56#else
57#define __DEFAULT_FN_ATTRS \
58 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
59 __min_vector_width__(128)))
60#endif
61
62#if defined(__cplusplus) && (__cplusplus >= 201103L)
63#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
64#else
65#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
66#endif
67
68#define __trunc64(x) \
69 (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
70#define __anyext128(x) \
71 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
72 1, -1, -1)
73
74/// Adds lower double-precision values in both operands and returns the
75/// sum in the lower 64 bits of the result. The upper 64 bits of the result
76/// are copied from the upper double-precision value of the first operand.
77///
78/// \headerfile <x86intrin.h>
79///
80/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
81///
82/// \param __a
83/// A 128-bit vector of [2 x double] containing one of the source operands.
84/// \param __b
85/// A 128-bit vector of [2 x double] containing one of the source operands.
86/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
87/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
88/// from the upper 64 bits of the first source operand.
89static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a,
90 __m128d __b) {
91 __a[0] += __b[0];
92 return __a;
93}
94
95/// Adds two 128-bit vectors of [2 x double].
96///
97/// \headerfile <x86intrin.h>
98///
99/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
100///
101/// \param __a
102/// A 128-bit vector of [2 x double] containing one of the source operands.
103/// \param __b
104/// A 128-bit vector of [2 x double] containing one of the source operands.
105/// \returns A 128-bit vector of [2 x double] containing the sums of both
106/// operands.
107static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a,
108 __m128d __b) {
109 return (__m128d)((__v2df)__a + (__v2df)__b);
110}
111
112/// Subtracts the lower double-precision value of the second operand
113/// from the lower double-precision value of the first operand and returns
114/// the difference in the lower 64 bits of the result. The upper 64 bits of
115/// the result are copied from the upper double-precision value of the first
116/// operand.
117///
118/// \headerfile <x86intrin.h>
119///
120/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
121///
122/// \param __a
123/// A 128-bit vector of [2 x double] containing the minuend.
124/// \param __b
125/// A 128-bit vector of [2 x double] containing the subtrahend.
126/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
127/// difference of the lower 64 bits of both operands. The upper 64 bits are
128/// copied from the upper 64 bits of the first source operand.
129static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a,
130 __m128d __b) {
131 __a[0] -= __b[0];
132 return __a;
133}
134
135/// Subtracts two 128-bit vectors of [2 x double].
136///
137/// \headerfile <x86intrin.h>
138///
139/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
140///
141/// \param __a
142/// A 128-bit vector of [2 x double] containing the minuend.
143/// \param __b
144/// A 128-bit vector of [2 x double] containing the subtrahend.
145/// \returns A 128-bit vector of [2 x double] containing the differences between
146/// both operands.
147static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a,
148 __m128d __b) {
149 return (__m128d)((__v2df)__a - (__v2df)__b);
150}
151
152/// Multiplies lower double-precision values in both operands and returns
153/// the product in the lower 64 bits of the result. The upper 64 bits of the
154/// result are copied from the upper double-precision value of the first
155/// operand.
156///
157/// \headerfile <x86intrin.h>
158///
159/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
160///
161/// \param __a
162/// A 128-bit vector of [2 x double] containing one of the source operands.
163/// \param __b
164/// A 128-bit vector of [2 x double] containing one of the source operands.
165/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
166/// product of the lower 64 bits of both operands. The upper 64 bits are
167/// copied from the upper 64 bits of the first source operand.
168static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a,
169 __m128d __b) {
170 __a[0] *= __b[0];
171 return __a;
172}
173
174/// Multiplies two 128-bit vectors of [2 x double].
175///
176/// \headerfile <x86intrin.h>
177///
178/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
179///
180/// \param __a
181/// A 128-bit vector of [2 x double] containing one of the operands.
182/// \param __b
183/// A 128-bit vector of [2 x double] containing one of the operands.
184/// \returns A 128-bit vector of [2 x double] containing the products of both
185/// operands.
186static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a,
187 __m128d __b) {
188 return (__m128d)((__v2df)__a * (__v2df)__b);
189}
190
191/// Divides the lower double-precision value of the first operand by the
192/// lower double-precision value of the second operand and returns the
193/// quotient in the lower 64 bits of the result. The upper 64 bits of the
194/// result are copied from the upper double-precision value of the first
195/// operand.
196///
197/// \headerfile <x86intrin.h>
198///
199/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
200///
201/// \param __a
202/// A 128-bit vector of [2 x double] containing the dividend.
203/// \param __b
204/// A 128-bit vector of [2 x double] containing divisor.
205/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
206/// quotient of the lower 64 bits of both operands. The upper 64 bits are
207/// copied from the upper 64 bits of the first source operand.
208static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a,
209 __m128d __b) {
210 __a[0] /= __b[0];
211 return __a;
212}
213
214/// Performs an element-by-element division of two 128-bit vectors of
215/// [2 x double].
216///
217/// \headerfile <x86intrin.h>
218///
219/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
220///
221/// \param __a
222/// A 128-bit vector of [2 x double] containing the dividend.
223/// \param __b
224/// A 128-bit vector of [2 x double] containing the divisor.
225/// \returns A 128-bit vector of [2 x double] containing the quotients of both
226/// operands.
227static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
228 __m128d __b) {
229 return (__m128d)((__v2df)__a / (__v2df)__b);
230}
231
232/// Calculates the square root of the lower double-precision value of
233/// the second operand and returns it in the lower 64 bits of the result.
234/// The upper 64 bits of the result are copied from the upper
235/// double-precision value of the first operand.
236///
237/// \headerfile <x86intrin.h>
238///
239/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
240///
241/// \param __a
242/// A 128-bit vector of [2 x double] containing one of the operands. The
243/// upper 64 bits of this operand are copied to the upper 64 bits of the
244/// result.
245/// \param __b
246/// A 128-bit vector of [2 x double] containing one of the operands. The
247/// square root is calculated using the lower 64 bits of this operand.
248/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
249/// square root of the lower 64 bits of operand \a __b, and whose upper 64
250/// bits are copied from the upper 64 bits of operand \a __a.
251static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
252 __m128d __b) {
253 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
254 return __extension__(__m128d){__c[0], __a[1]};
255}
256
257/// Calculates the square root of the each of two values stored in a
258/// 128-bit vector of [2 x double].
259///
260/// \headerfile <x86intrin.h>
261///
262/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
263///
264/// \param __a
265/// A 128-bit vector of [2 x double].
266/// \returns A 128-bit vector of [2 x double] containing the square roots of the
267/// values in the operand.
268static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
269 return __builtin_ia32_sqrtpd((__v2df)__a);
270}
271
272/// Compares lower 64-bit double-precision values of both operands, and
273/// returns the lesser of the pair of values in the lower 64-bits of the
274/// result. The upper 64 bits of the result are copied from the upper
275/// double-precision value of the first operand.
276///
277/// If either value in a comparison is NaN, returns the value from \a __b.
278///
279/// \headerfile <x86intrin.h>
280///
281/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
282///
283/// \param __a
284/// A 128-bit vector of [2 x double] containing one of the operands. The
285/// lower 64 bits of this operand are used in the comparison.
286/// \param __b
287/// A 128-bit vector of [2 x double] containing one of the operands. The
288/// lower 64 bits of this operand are used in the comparison.
289/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
290/// minimum value between both operands. The upper 64 bits are copied from
291/// the upper 64 bits of the first source operand.
292static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
293 __m128d __b) {
294 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
295}
296
297/// Performs element-by-element comparison of the two 128-bit vectors of
298/// [2 x double] and returns a vector containing the lesser of each pair of
299/// values.
300///
301/// If either value in a comparison is NaN, returns the value from \a __b.
302///
303/// \headerfile <x86intrin.h>
304///
305/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
306///
307/// \param __a
308/// A 128-bit vector of [2 x double] containing one of the operands.
309/// \param __b
310/// A 128-bit vector of [2 x double] containing one of the operands.
311/// \returns A 128-bit vector of [2 x double] containing the minimum values
312/// between both operands.
313static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
314 __m128d __b) {
315 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
316}
317
318/// Compares lower 64-bit double-precision values of both operands, and
319/// returns the greater of the pair of values in the lower 64-bits of the
320/// result. The upper 64 bits of the result are copied from the upper
321/// double-precision value of the first operand.
322///
323/// If either value in a comparison is NaN, returns the value from \a __b.
324///
325/// \headerfile <x86intrin.h>
326///
327/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
328///
329/// \param __a
330/// A 128-bit vector of [2 x double] containing one of the operands. The
331/// lower 64 bits of this operand are used in the comparison.
332/// \param __b
333/// A 128-bit vector of [2 x double] containing one of the operands. The
334/// lower 64 bits of this operand are used in the comparison.
335/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
336/// maximum value between both operands. The upper 64 bits are copied from
337/// the upper 64 bits of the first source operand.
338static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
339 __m128d __b) {
340 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
341}
342
343/// Performs element-by-element comparison of the two 128-bit vectors of
344/// [2 x double] and returns a vector containing the greater of each pair
345/// of values.
346///
347/// If either value in a comparison is NaN, returns the value from \a __b.
348///
349/// \headerfile <x86intrin.h>
350///
351/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
352///
353/// \param __a
354/// A 128-bit vector of [2 x double] containing one of the operands.
355/// \param __b
356/// A 128-bit vector of [2 x double] containing one of the operands.
357/// \returns A 128-bit vector of [2 x double] containing the maximum values
358/// between both operands.
359static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
360 __m128d __b) {
361 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
362}
363
364/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
365///
366/// \headerfile <x86intrin.h>
367///
368/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
369///
370/// \param __a
371/// A 128-bit vector of [2 x double] containing one of the source operands.
372/// \param __b
373/// A 128-bit vector of [2 x double] containing one of the source operands.
374/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
375/// values between both operands.
376static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a,
377 __m128d __b) {
378 return (__m128d)((__v2du)__a & (__v2du)__b);
379}
380
381/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
382/// the one's complement of the values contained in the first source operand.
383///
384/// \headerfile <x86intrin.h>
385///
386/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
387///
388/// \param __a
389/// A 128-bit vector of [2 x double] containing the left source operand. The
390/// one's complement of this value is used in the bitwise AND.
391/// \param __b
392/// A 128-bit vector of [2 x double] containing the right source operand.
393/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
394/// values in the second operand and the one's complement of the first
395/// operand.
396static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
397_mm_andnot_pd(__m128d __a, __m128d __b) {
398 return (__m128d)(~(__v2du)__a & (__v2du)__b);
399}
400
401/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
402///
403/// \headerfile <x86intrin.h>
404///
405/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
406///
407/// \param __a
408/// A 128-bit vector of [2 x double] containing one of the source operands.
409/// \param __b
410/// A 128-bit vector of [2 x double] containing one of the source operands.
411/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
412/// values between both operands.
413static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a,
414 __m128d __b) {
415 return (__m128d)((__v2du)__a | (__v2du)__b);
416}
417
418/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
419///
420/// \headerfile <x86intrin.h>
421///
422/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
423///
424/// \param __a
425/// A 128-bit vector of [2 x double] containing one of the source operands.
426/// \param __b
427/// A 128-bit vector of [2 x double] containing one of the source operands.
428/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
429/// values between both operands.
430static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a,
431 __m128d __b) {
432 return (__m128d)((__v2du)__a ^ (__v2du)__b);
433}
434
435/// Compares each of the corresponding double-precision values of the
436/// 128-bit vectors of [2 x double] for equality.
437///
438/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
439/// If either value in a comparison is NaN, returns false.
440///
441/// \headerfile <x86intrin.h>
442///
443/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
444///
445/// \param __a
446/// A 128-bit vector of [2 x double].
447/// \param __b
448/// A 128-bit vector of [2 x double].
449/// \returns A 128-bit vector containing the comparison results.
450static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
451 __m128d __b) {
452 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
453}
454
455/// Compares each of the corresponding double-precision values of the
456/// 128-bit vectors of [2 x double] to determine if the values in the first
457/// operand are less than those in the second operand.
458///
459/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
460/// If either value in a comparison is NaN, returns false.
461///
462/// \headerfile <x86intrin.h>
463///
464/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
465///
466/// \param __a
467/// A 128-bit vector of [2 x double].
468/// \param __b
469/// A 128-bit vector of [2 x double].
470/// \returns A 128-bit vector containing the comparison results.
471static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
472 __m128d __b) {
473 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
474}
475
476/// Compares each of the corresponding double-precision values of the
477/// 128-bit vectors of [2 x double] to determine if the values in the first
478/// operand are less than or equal to those in the second operand.
479///
480/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
481/// If either value in a comparison is NaN, returns false.
482///
483/// \headerfile <x86intrin.h>
484///
485/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
486///
487/// \param __a
488/// A 128-bit vector of [2 x double].
489/// \param __b
490/// A 128-bit vector of [2 x double].
491/// \returns A 128-bit vector containing the comparison results.
492static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
493 __m128d __b) {
494 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
495}
496
497/// Compares each of the corresponding double-precision values of the
498/// 128-bit vectors of [2 x double] to determine if the values in the first
499/// operand are greater than those in the second operand.
500///
501/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
502/// If either value in a comparison is NaN, returns false.
503///
504/// \headerfile <x86intrin.h>
505///
506/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
507///
508/// \param __a
509/// A 128-bit vector of [2 x double].
510/// \param __b
511/// A 128-bit vector of [2 x double].
512/// \returns A 128-bit vector containing the comparison results.
513static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
514 __m128d __b) {
515 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
516}
517
518/// Compares each of the corresponding double-precision values of the
519/// 128-bit vectors of [2 x double] to determine if the values in the first
520/// operand are greater than or equal to those in the second operand.
521///
522/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
523/// If either value in a comparison is NaN, returns false.
524///
525/// \headerfile <x86intrin.h>
526///
527/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
528///
529/// \param __a
530/// A 128-bit vector of [2 x double].
531/// \param __b
532/// A 128-bit vector of [2 x double].
533/// \returns A 128-bit vector containing the comparison results.
534static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
535 __m128d __b) {
536 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
537}
538
539/// Compares each of the corresponding double-precision values of the
540/// 128-bit vectors of [2 x double] to determine if the values in the first
541/// operand are ordered with respect to those in the second operand.
542///
543/// A pair of double-precision values are ordered with respect to each
544/// other if neither value is a NaN. Each comparison returns 0x0 for false,
545/// 0xFFFFFFFFFFFFFFFF for true.
546///
547/// \headerfile <x86intrin.h>
548///
549/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
550///
551/// \param __a
552/// A 128-bit vector of [2 x double].
553/// \param __b
554/// A 128-bit vector of [2 x double].
555/// \returns A 128-bit vector containing the comparison results.
556static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
557 __m128d __b) {
558 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
559}
560
561/// Compares each of the corresponding double-precision values of the
562/// 128-bit vectors of [2 x double] to determine if the values in the first
563/// operand are unordered with respect to those in the second operand.
564///
565/// A pair of double-precision values are unordered with respect to each
566/// other if one or both values are NaN. Each comparison returns 0x0 for
567/// false, 0xFFFFFFFFFFFFFFFF for true.
568///
569/// \headerfile <x86intrin.h>
570///
571/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
572/// instruction.
573///
574/// \param __a
575/// A 128-bit vector of [2 x double].
576/// \param __b
577/// A 128-bit vector of [2 x double].
578/// \returns A 128-bit vector containing the comparison results.
579static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
580 __m128d __b) {
581 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
582}
583
584/// Compares each of the corresponding double-precision values of the
585/// 128-bit vectors of [2 x double] to determine if the values in the first
586/// operand are unequal to those in the second operand.
587///
588/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
589/// If either value in a comparison is NaN, returns true.
590///
591/// \headerfile <x86intrin.h>
592///
593/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
594///
595/// \param __a
596/// A 128-bit vector of [2 x double].
597/// \param __b
598/// A 128-bit vector of [2 x double].
599/// \returns A 128-bit vector containing the comparison results.
600static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
601 __m128d __b) {
602 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
603}
604
605/// Compares each of the corresponding double-precision values of the
606/// 128-bit vectors of [2 x double] to determine if the values in the first
607/// operand are not less than those in the second operand.
608///
609/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
610/// If either value in a comparison is NaN, returns true.
611///
612/// \headerfile <x86intrin.h>
613///
614/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
615///
616/// \param __a
617/// A 128-bit vector of [2 x double].
618/// \param __b
619/// A 128-bit vector of [2 x double].
620/// \returns A 128-bit vector containing the comparison results.
621static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
622 __m128d __b) {
623 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
624}
625
626/// Compares each of the corresponding double-precision values of the
627/// 128-bit vectors of [2 x double] to determine if the values in the first
628/// operand are not less than or equal to those in the second operand.
629///
630/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
631/// If either value in a comparison is NaN, returns true.
632///
633/// \headerfile <x86intrin.h>
634///
635/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
636///
637/// \param __a
638/// A 128-bit vector of [2 x double].
639/// \param __b
640/// A 128-bit vector of [2 x double].
641/// \returns A 128-bit vector containing the comparison results.
642static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
643 __m128d __b) {
644 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
645}
646
647/// Compares each of the corresponding double-precision values of the
648/// 128-bit vectors of [2 x double] to determine if the values in the first
649/// operand are not greater than those in the second operand.
650///
651/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
652/// If either value in a comparison is NaN, returns true.
653///
654/// \headerfile <x86intrin.h>
655///
656/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
657///
658/// \param __a
659/// A 128-bit vector of [2 x double].
660/// \param __b
661/// A 128-bit vector of [2 x double].
662/// \returns A 128-bit vector containing the comparison results.
663static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
664 __m128d __b) {
665 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
666}
667
668/// Compares each of the corresponding double-precision values of the
669/// 128-bit vectors of [2 x double] to determine if the values in the first
670/// operand are not greater than or equal to those in the second operand.
671///
672/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
673/// If either value in a comparison is NaN, returns true.
674///
675/// \headerfile <x86intrin.h>
676///
677/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
678///
679/// \param __a
680/// A 128-bit vector of [2 x double].
681/// \param __b
682/// A 128-bit vector of [2 x double].
683/// \returns A 128-bit vector containing the comparison results.
684static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
685 __m128d __b) {
686 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
687}
688
689/// Compares the lower double-precision floating-point values in each of
690/// the two 128-bit floating-point vectors of [2 x double] for equality.
691///
692/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
693/// If either value in a comparison is NaN, returns false.
694///
695/// \headerfile <x86intrin.h>
696///
697/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
698///
699/// \param __a
700/// A 128-bit vector of [2 x double]. The lower double-precision value is
701/// compared to the lower double-precision value of \a __b.
702/// \param __b
703/// A 128-bit vector of [2 x double]. The lower double-precision value is
704/// compared to the lower double-precision value of \a __a.
705/// \returns A 128-bit vector. The lower 64 bits contains the comparison
706/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
707static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
708 __m128d __b) {
709 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
710}
711
712/// Compares the lower double-precision floating-point values in each of
713/// the two 128-bit floating-point vectors of [2 x double] to determine if
714/// the value in the first parameter is less than the corresponding value in
715/// the second parameter.
716///
717/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
718/// If either value in a comparison is NaN, returns false.
719///
720/// \headerfile <x86intrin.h>
721///
722/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
723///
724/// \param __a
725/// A 128-bit vector of [2 x double]. The lower double-precision value is
726/// compared to the lower double-precision value of \a __b.
727/// \param __b
728/// A 128-bit vector of [2 x double]. The lower double-precision value is
729/// compared to the lower double-precision value of \a __a.
730/// \returns A 128-bit vector. The lower 64 bits contains the comparison
731/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
732static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
733 __m128d __b) {
734 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
735}
736
737/// Compares the lower double-precision floating-point values in each of
738/// the two 128-bit floating-point vectors of [2 x double] to determine if
739/// the value in the first parameter is less than or equal to the
740/// corresponding value in the second parameter.
741///
742/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
743/// If either value in a comparison is NaN, returns false.
744///
745/// \headerfile <x86intrin.h>
746///
747/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
748///
749/// \param __a
750/// A 128-bit vector of [2 x double]. The lower double-precision value is
751/// compared to the lower double-precision value of \a __b.
752/// \param __b
753/// A 128-bit vector of [2 x double]. The lower double-precision value is
754/// compared to the lower double-precision value of \a __a.
755/// \returns A 128-bit vector. The lower 64 bits contains the comparison
756/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
757static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
758 __m128d __b) {
759 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
760}
761
762/// Compares the lower double-precision floating-point values in each of
763/// the two 128-bit floating-point vectors of [2 x double] to determine if
764/// the value in the first parameter is greater than the corresponding value
765/// in the second parameter.
766///
767/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
768/// If either value in a comparison is NaN, returns false.
769///
770/// \headerfile <x86intrin.h>
771///
772/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
773///
774/// \param __a
775/// A 128-bit vector of [2 x double]. The lower double-precision value is
776/// compared to the lower double-precision value of \a __b.
777/// \param __b
778/// A 128-bit vector of [2 x double]. The lower double-precision value is
779/// compared to the lower double-precision value of \a __a.
780/// \returns A 128-bit vector. The lower 64 bits contains the comparison
781/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
782static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
783 __m128d __b) {
784 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
785 return __extension__(__m128d){__c[0], __a[1]};
786}
787
788/// Compares the lower double-precision floating-point values in each of
789/// the two 128-bit floating-point vectors of [2 x double] to determine if
790/// the value in the first parameter is greater than or equal to the
791/// corresponding value in the second parameter.
792///
793/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
794/// If either value in a comparison is NaN, returns false.
795///
796/// \headerfile <x86intrin.h>
797///
798/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
799///
800/// \param __a
801/// A 128-bit vector of [2 x double]. The lower double-precision value is
802/// compared to the lower double-precision value of \a __b.
803/// \param __b
804/// A 128-bit vector of [2 x double]. The lower double-precision value is
805/// compared to the lower double-precision value of \a __a.
806/// \returns A 128-bit vector. The lower 64 bits contains the comparison
807/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
808static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
809 __m128d __b) {
810 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
811 return __extension__(__m128d){__c[0], __a[1]};
812}
813
814/// Compares the lower double-precision floating-point values in each of
815/// the two 128-bit floating-point vectors of [2 x double] to determine if
816/// the value in the first parameter is ordered with respect to the
817/// corresponding value in the second parameter.
818///
819/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
820/// of double-precision values are ordered with respect to each other if
821/// neither value is a NaN.
822///
823/// \headerfile <x86intrin.h>
824///
825/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
826///
827/// \param __a
828/// A 128-bit vector of [2 x double]. The lower double-precision value is
829/// compared to the lower double-precision value of \a __b.
830/// \param __b
831/// A 128-bit vector of [2 x double]. The lower double-precision value is
832/// compared to the lower double-precision value of \a __a.
833/// \returns A 128-bit vector. The lower 64 bits contains the comparison
834/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
835static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
836 __m128d __b) {
837 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
838}
839
840/// Compares the lower double-precision floating-point values in each of
841/// the two 128-bit floating-point vectors of [2 x double] to determine if
842/// the value in the first parameter is unordered with respect to the
843/// corresponding value in the second parameter.
844///
845/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
846/// of double-precision values are unordered with respect to each other if
847/// one or both values are NaN.
848///
849/// \headerfile <x86intrin.h>
850///
851/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
852/// instruction.
853///
854/// \param __a
855/// A 128-bit vector of [2 x double]. The lower double-precision value is
856/// compared to the lower double-precision value of \a __b.
857/// \param __b
858/// A 128-bit vector of [2 x double]. The lower double-precision value is
859/// compared to the lower double-precision value of \a __a.
860/// \returns A 128-bit vector. The lower 64 bits contains the comparison
861/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
862static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
863 __m128d __b) {
864 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
865}
866
867/// Compares the lower double-precision floating-point values in each of
868/// the two 128-bit floating-point vectors of [2 x double] to determine if
869/// the value in the first parameter is unequal to the corresponding value in
870/// the second parameter.
871///
872/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
873/// If either value in a comparison is NaN, returns true.
874///
875/// \headerfile <x86intrin.h>
876///
877/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
878///
879/// \param __a
880/// A 128-bit vector of [2 x double]. The lower double-precision value is
881/// compared to the lower double-precision value of \a __b.
882/// \param __b
883/// A 128-bit vector of [2 x double]. The lower double-precision value is
884/// compared to the lower double-precision value of \a __a.
885/// \returns A 128-bit vector. The lower 64 bits contains the comparison
886/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
887static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
888 __m128d __b) {
889 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
890}
891
892/// Compares the lower double-precision floating-point values in each of
893/// the two 128-bit floating-point vectors of [2 x double] to determine if
894/// the value in the first parameter is not less than the corresponding
895/// value in the second parameter.
896///
897/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
898/// If either value in a comparison is NaN, returns true.
899///
900/// \headerfile <x86intrin.h>
901///
902/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
903///
904/// \param __a
905/// A 128-bit vector of [2 x double]. The lower double-precision value is
906/// compared to the lower double-precision value of \a __b.
907/// \param __b
908/// A 128-bit vector of [2 x double]. The lower double-precision value is
909/// compared to the lower double-precision value of \a __a.
910/// \returns A 128-bit vector. The lower 64 bits contains the comparison
911/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
912static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
913 __m128d __b) {
914 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
915}
916
917/// Compares the lower double-precision floating-point values in each of
918/// the two 128-bit floating-point vectors of [2 x double] to determine if
919/// the value in the first parameter is not less than or equal to the
920/// corresponding value in the second parameter.
921///
922/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
923/// If either value in a comparison is NaN, returns true.
924///
925/// \headerfile <x86intrin.h>
926///
927/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
928///
929/// \param __a
930/// A 128-bit vector of [2 x double]. The lower double-precision value is
931/// compared to the lower double-precision value of \a __b.
932/// \param __b
933/// A 128-bit vector of [2 x double]. The lower double-precision value is
934/// compared to the lower double-precision value of \a __a.
935/// \returns A 128-bit vector. The lower 64 bits contains the comparison
936/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
937static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
938 __m128d __b) {
939 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
940}
941
942/// Compares the lower double-precision floating-point values in each of
943/// the two 128-bit floating-point vectors of [2 x double] to determine if
944/// the value in the first parameter is not greater than the corresponding
945/// value in the second parameter.
946///
947/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
948/// If either value in a comparison is NaN, returns true.
949///
950/// \headerfile <x86intrin.h>
951///
952/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
953///
954/// \param __a
955/// A 128-bit vector of [2 x double]. The lower double-precision value is
956/// compared to the lower double-precision value of \a __b.
957/// \param __b
958/// A 128-bit vector of [2 x double]. The lower double-precision value is
959/// compared to the lower double-precision value of \a __a.
960/// \returns A 128-bit vector. The lower 64 bits contains the comparison
961/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
962static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
963 __m128d __b) {
964 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
965 return __extension__(__m128d){__c[0], __a[1]};
966}
967
968/// Compares the lower double-precision floating-point values in each of
969/// the two 128-bit floating-point vectors of [2 x double] to determine if
970/// the value in the first parameter is not greater than or equal to the
971/// corresponding value in the second parameter.
972///
973/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
974/// If either value in a comparison is NaN, returns true.
975///
976/// \headerfile <x86intrin.h>
977///
978/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
979///
980/// \param __a
981/// A 128-bit vector of [2 x double]. The lower double-precision value is
982/// compared to the lower double-precision value of \a __b.
983/// \param __b
984/// A 128-bit vector of [2 x double]. The lower double-precision value is
985/// compared to the lower double-precision value of \a __a.
986/// \returns A 128-bit vector. The lower 64 bits contains the comparison
987/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
988static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
989 __m128d __b) {
990 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
991 return __extension__(__m128d){__c[0], __a[1]};
992}
993
994/// Compares the lower double-precision floating-point values in each of
995/// the two 128-bit floating-point vectors of [2 x double] for equality.
996///
997/// The comparison returns 0 for false, 1 for true. If either value in a
998/// comparison is NaN, returns 0.
999///
1000/// \headerfile <x86intrin.h>
1001///
1002/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1003///
1004/// \param __a
1005/// A 128-bit vector of [2 x double]. The lower double-precision value is
1006/// compared to the lower double-precision value of \a __b.
1007/// \param __b
1008/// A 128-bit vector of [2 x double]. The lower double-precision value is
1009/// compared to the lower double-precision value of \a __a.
1010/// \returns An integer containing the comparison results.
1011static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
1012 __m128d __b) {
1013 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1014}
1015
1016/// Compares the lower double-precision floating-point values in each of
1017/// the two 128-bit floating-point vectors of [2 x double] to determine if
1018/// the value in the first parameter is less than the corresponding value in
1019/// the second parameter.
1020///
1021/// The comparison returns 0 for false, 1 for true. If either value in a
1022/// comparison is NaN, returns 0.
1023///
1024/// \headerfile <x86intrin.h>
1025///
1026/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1027///
1028/// \param __a
1029/// A 128-bit vector of [2 x double]. The lower double-precision value is
1030/// compared to the lower double-precision value of \a __b.
1031/// \param __b
1032/// A 128-bit vector of [2 x double]. The lower double-precision value is
1033/// compared to the lower double-precision value of \a __a.
1034/// \returns An integer containing the comparison results.
1035static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1036 __m128d __b) {
1037 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1038}
1039
1040/// Compares the lower double-precision floating-point values in each of
1041/// the two 128-bit floating-point vectors of [2 x double] to determine if
1042/// the value in the first parameter is less than or equal to the
1043/// corresponding value in the second parameter.
1044///
1045/// The comparison returns 0 for false, 1 for true. If either value in a
1046/// comparison is NaN, returns 0.
1047///
1048/// \headerfile <x86intrin.h>
1049///
1050/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1051///
1052/// \param __a
1053/// A 128-bit vector of [2 x double]. The lower double-precision value is
1054/// compared to the lower double-precision value of \a __b.
1055/// \param __b
1056/// A 128-bit vector of [2 x double]. The lower double-precision value is
1057/// compared to the lower double-precision value of \a __a.
1058/// \returns An integer containing the comparison results.
1059static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1060 __m128d __b) {
1061 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1062}
1063
1064/// Compares the lower double-precision floating-point values in each of
1065/// the two 128-bit floating-point vectors of [2 x double] to determine if
1066/// the value in the first parameter is greater than the corresponding value
1067/// in the second parameter.
1068///
1069/// The comparison returns 0 for false, 1 for true. If either value in a
1070/// comparison is NaN, returns 0.
1071///
1072/// \headerfile <x86intrin.h>
1073///
1074/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1075///
1076/// \param __a
1077/// A 128-bit vector of [2 x double]. The lower double-precision value is
1078/// compared to the lower double-precision value of \a __b.
1079/// \param __b
1080/// A 128-bit vector of [2 x double]. The lower double-precision value is
1081/// compared to the lower double-precision value of \a __a.
1082/// \returns An integer containing the comparison results.
1083static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1084 __m128d __b) {
1085 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1086}
1087
1088/// Compares the lower double-precision floating-point values in each of
1089/// the two 128-bit floating-point vectors of [2 x double] to determine if
1090/// the value in the first parameter is greater than or equal to the
1091/// corresponding value in the second parameter.
1092///
1093/// The comparison returns 0 for false, 1 for true. If either value in a
1094/// comparison is NaN, returns 0.
1095///
1096/// \headerfile <x86intrin.h>
1097///
1098/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1099///
1100/// \param __a
1101/// A 128-bit vector of [2 x double]. The lower double-precision value is
1102/// compared to the lower double-precision value of \a __b.
1103/// \param __b
1104/// A 128-bit vector of [2 x double]. The lower double-precision value is
1105/// compared to the lower double-precision value of \a __a.
1106/// \returns An integer containing the comparison results.
1107static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1108 __m128d __b) {
1109 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1110}
1111
1112/// Compares the lower double-precision floating-point values in each of
1113/// the two 128-bit floating-point vectors of [2 x double] to determine if
1114/// the value in the first parameter is unequal to the corresponding value in
1115/// the second parameter.
1116///
1117/// The comparison returns 0 for false, 1 for true. If either value in a
1118/// comparison is NaN, returns 1.
1119///
1120/// \headerfile <x86intrin.h>
1121///
1122/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1123///
1124/// \param __a
1125/// A 128-bit vector of [2 x double]. The lower double-precision value is
1126/// compared to the lower double-precision value of \a __b.
1127/// \param __b
1128/// A 128-bit vector of [2 x double]. The lower double-precision value is
1129/// compared to the lower double-precision value of \a __a.
1130/// \returns An integer containing the comparison results.
1131static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1132 __m128d __b) {
1133 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1134}
1135
1136/// Compares the lower double-precision floating-point values in each of
1137/// the two 128-bit floating-point vectors of [2 x double] for equality.
1138///
1139/// The comparison returns 0 for false, 1 for true. If either value in a
1140/// comparison is NaN, returns 0.
1141///
1142/// \headerfile <x86intrin.h>
1143///
1144/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1145///
1146/// \param __a
1147/// A 128-bit vector of [2 x double]. The lower double-precision value is
1148/// compared to the lower double-precision value of \a __b.
1149/// \param __b
1150/// A 128-bit vector of [2 x double]. The lower double-precision value is
1151/// compared to the lower double-precision value of \a __a.
1152/// \returns An integer containing the comparison results.
1153static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1154 __m128d __b) {
1155 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1156}
1157
1158/// Compares the lower double-precision floating-point values in each of
1159/// the two 128-bit floating-point vectors of [2 x double] to determine if
1160/// the value in the first parameter is less than the corresponding value in
1161/// the second parameter.
1162///
1163/// The comparison returns 0 for false, 1 for true. If either value in a
1164/// comparison is NaN, returns 0.
1165///
1166/// \headerfile <x86intrin.h>
1167///
1168/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1169///
1170/// \param __a
1171/// A 128-bit vector of [2 x double]. The lower double-precision value is
1172/// compared to the lower double-precision value of \a __b.
1173/// \param __b
1174/// A 128-bit vector of [2 x double]. The lower double-precision value is
1175/// compared to the lower double-precision value of \a __a.
1176/// \returns An integer containing the comparison results.
1177static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1178 __m128d __b) {
1179 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1180}
1181
1182/// Compares the lower double-precision floating-point values in each of
1183/// the two 128-bit floating-point vectors of [2 x double] to determine if
1184/// the value in the first parameter is less than or equal to the
1185/// corresponding value in the second parameter.
1186///
1187/// The comparison returns 0 for false, 1 for true. If either value in a
1188/// comparison is NaN, returns 0.
1189///
1190/// \headerfile <x86intrin.h>
1191///
1192/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1193///
1194/// \param __a
1195/// A 128-bit vector of [2 x double]. The lower double-precision value is
1196/// compared to the lower double-precision value of \a __b.
1197/// \param __b
1198/// A 128-bit vector of [2 x double]. The lower double-precision value is
1199/// compared to the lower double-precision value of \a __a.
1200/// \returns An integer containing the comparison results.
1201static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1202 __m128d __b) {
1203 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1204}
1205
1206/// Compares the lower double-precision floating-point values in each of
1207/// the two 128-bit floating-point vectors of [2 x double] to determine if
1208/// the value in the first parameter is greater than the corresponding value
1209/// in the second parameter.
1210///
1211/// The comparison returns 0 for false, 1 for true. If either value in a
1212/// comparison is NaN, returns 0.
1213///
1214/// \headerfile <x86intrin.h>
1215///
1216/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1217///
1218/// \param __a
1219/// A 128-bit vector of [2 x double]. The lower double-precision value is
1220/// compared to the lower double-precision value of \a __b.
1221/// \param __b
1222/// A 128-bit vector of [2 x double]. The lower double-precision value is
1223/// compared to the lower double-precision value of \a __a.
1224/// \returns An integer containing the comparison results.
1225static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1226 __m128d __b) {
1227 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1228}
1229
1230/// Compares the lower double-precision floating-point values in each of
1231/// the two 128-bit floating-point vectors of [2 x double] to determine if
1232/// the value in the first parameter is greater than or equal to the
1233/// corresponding value in the second parameter.
1234///
1235/// The comparison returns 0 for false, 1 for true. If either value in a
1236/// comparison is NaN, returns 0.
1237///
1238/// \headerfile <x86intrin.h>
1239///
1240/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1241///
1242/// \param __a
1243/// A 128-bit vector of [2 x double]. The lower double-precision value is
1244/// compared to the lower double-precision value of \a __b.
1245/// \param __b
1246/// A 128-bit vector of [2 x double]. The lower double-precision value is
1247/// compared to the lower double-precision value of \a __a.
1248/// \returns An integer containing the comparison results.
1249static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1250 __m128d __b) {
1251 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1252}
1253
1254/// Compares the lower double-precision floating-point values in each of
1255/// the two 128-bit floating-point vectors of [2 x double] to determine if
1256/// the value in the first parameter is unequal to the corresponding value in
1257/// the second parameter.
1258///
1259/// The comparison returns 0 for false, 1 for true. If either value in a
1260/// comparison is NaN, returns 1.
1261///
1262/// \headerfile <x86intrin.h>
1263///
1264/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1265///
1266/// \param __a
1267/// A 128-bit vector of [2 x double]. The lower double-precision value is
1268/// compared to the lower double-precision value of \a __b.
1269/// \param __b
1270/// A 128-bit vector of [2 x double]. The lower double-precision value is
1271/// compared to the lower double-precision value of \a __a.
1272/// \returns An integer containing the comparison result.
1273static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1274 __m128d __b) {
1275 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1276}
1277
1278/// Converts the two double-precision floating-point elements of a
1279/// 128-bit vector of [2 x double] into two single-precision floating-point
1280/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1281/// The upper 64 bits of the result vector are set to zero.
1282///
1283/// \headerfile <x86intrin.h>
1284///
1285/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1286///
1287/// \param __a
1288/// A 128-bit vector of [2 x double].
1289/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1290/// converted values. The upper 64 bits are set to zero.
1291static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1292 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1293}
1294
1295/// Converts the lower two single-precision floating-point elements of a
1296/// 128-bit vector of [4 x float] into two double-precision floating-point
1297/// values, returned in a 128-bit vector of [2 x double]. The upper two
1298/// elements of the input vector are unused.
1299///
1300/// \headerfile <x86intrin.h>
1301///
1302/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1303///
1304/// \param __a
1305/// A 128-bit vector of [4 x float]. The lower two single-precision
1306/// floating-point elements are converted to double-precision values. The
1307/// upper two elements are unused.
1308/// \returns A 128-bit vector of [2 x double] containing the converted values.
1309static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1311 return (__m128d) __builtin_convertvector(
1312 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1313}
1314
1315/// Converts the lower two integer elements of a 128-bit vector of
1316/// [4 x i32] into two double-precision floating-point values, returned in a
1317/// 128-bit vector of [2 x double].
1318///
1319/// The upper two elements of the input vector are unused.
1320///
1321/// \headerfile <x86intrin.h>
1322///
1323/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1324///
1325/// \param __a
1326/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1327/// converted to double-precision values.
1328///
1329/// The upper two elements are unused.
1330/// \returns A 128-bit vector of [2 x double] containing the converted values.
1331static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1333 return (__m128d) __builtin_convertvector(
1334 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1335}
1336
1337/// Converts the two double-precision floating-point elements of a
1338/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1339/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1340/// 64 bits of the result vector are set to zero.
1341///
1342/// If a converted value does not fit in a 32-bit integer, raises a
1343/// floating-point invalid exception. If the exception is masked, returns
1344/// the most negative integer.
1345///
1346/// \headerfile <x86intrin.h>
1347///
1348/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1349///
1350/// \param __a
1351/// A 128-bit vector of [2 x double].
1352/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1353/// converted values. The upper 64 bits are set to zero.
1354static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1355 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1356}
1357
1358/// Converts the low-order element of a 128-bit vector of [2 x double]
1359/// into a 32-bit signed integer value.
1360///
1361/// If the converted value does not fit in a 32-bit integer, raises a
1362/// floating-point invalid exception. If the exception is masked, returns
1363/// the most negative integer.
1364///
1365/// \headerfile <x86intrin.h>
1366///
1367/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1368///
1369/// \param __a
1370/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1371/// conversion.
1372/// \returns A 32-bit signed integer containing the converted value.
1373static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1374 return __builtin_ia32_cvtsd2si((__v2df)__a);
1375}
1376
1377/// Converts the lower double-precision floating-point element of a
1378/// 128-bit vector of [2 x double], in the second parameter, into a
1379/// single-precision floating-point value, returned in the lower 32 bits of a
1380/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1381/// copied from the upper 96 bits of the first parameter.
1382///
1383/// \headerfile <x86intrin.h>
1384///
1385/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1386///
1387/// \param __a
1388/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1389/// copied to the upper 96 bits of the result.
1390/// \param __b
1391/// A 128-bit vector of [2 x double]. The lower double-precision
1392/// floating-point element is used in the conversion.
1393/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1394/// converted value from the second parameter. The upper 96 bits are copied
1395/// from the upper 96 bits of the first parameter.
1396static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1397 __m128d __b) {
1398 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1399}
1400
1401/// Converts a 32-bit signed integer value, in the second parameter, into
1402/// a double-precision floating-point value, returned in the lower 64 bits of
1403/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1404/// are copied from the upper 64 bits of the first parameter.
1405///
1406/// \headerfile <x86intrin.h>
1407///
1408/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1409///
1410/// \param __a
1411/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1412/// copied to the upper 64 bits of the result.
1413/// \param __b
1414/// A 32-bit signed integer containing the value to be converted.
1415/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1416/// converted value from the second parameter. The upper 64 bits are copied
1417/// from the upper 64 bits of the first parameter.
1418static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1419_mm_cvtsi32_sd(__m128d __a, int __b) {
1420 __a[0] = __b;
1421 return __a;
1422}
1423
1424/// Converts the lower single-precision floating-point element of a
1425/// 128-bit vector of [4 x float], in the second parameter, into a
1426/// double-precision floating-point value, returned in the lower 64 bits of
1427/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1428/// are copied from the upper 64 bits of the first parameter.
1429///
1430/// \headerfile <x86intrin.h>
1431///
1432/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1433///
1434/// \param __a
1435/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1436/// copied to the upper 64 bits of the result.
1437/// \param __b
1438/// A 128-bit vector of [4 x float]. The lower single-precision
1439/// floating-point element is used in the conversion.
1440/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1441/// converted value from the second parameter. The upper 64 bits are copied
1442/// from the upper 64 bits of the first parameter.
1443static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1444_mm_cvtss_sd(__m128d __a, __m128 __b) {
1445 __a[0] = __b[0];
1446 return __a;
1447}
1448
1449/// Converts the two double-precision floating-point elements of a
1450/// 128-bit vector of [2 x double] into two signed truncated (rounded
1451/// toward zero) 32-bit integer values, returned in the lower 64 bits
1452/// of a 128-bit vector of [4 x i32].
1453///
1454/// If a converted value does not fit in a 32-bit integer, raises a
1455/// floating-point invalid exception. If the exception is masked, returns
1456/// the most negative integer.
1457///
1458/// \headerfile <x86intrin.h>
1459///
1460/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1461/// instruction.
1462///
1463/// \param __a
1464/// A 128-bit vector of [2 x double].
1465/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1466/// converted values. The upper 64 bits are set to zero.
1467static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1468 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1469}
1470
1471/// Converts the low-order element of a [2 x double] vector into a 32-bit
1472/// signed truncated (rounded toward zero) integer value.
1473///
1474/// If the converted value does not fit in a 32-bit integer, raises a
1475/// floating-point invalid exception. If the exception is masked, returns
1476/// the most negative integer.
1477///
1478/// \headerfile <x86intrin.h>
1479///
1480/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1481/// instruction.
1482///
1483/// \param __a
1484/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1485/// conversion.
1486/// \returns A 32-bit signed integer containing the converted value.
1487static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1488 return __builtin_ia32_cvttsd2si((__v2df)__a);
1489}
1490
1491/// Converts the two double-precision floating-point elements of a
1492/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1493/// returned in a 64-bit vector of [2 x i32].
1494///
1495/// If a converted value does not fit in a 32-bit integer, raises a
1496/// floating-point invalid exception. If the exception is masked, returns
1497/// the most negative integer.
1498///
1499/// \headerfile <x86intrin.h>
1500///
1501/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1502///
1503/// \param __a
1504/// A 128-bit vector of [2 x double].
1505/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1506static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
1507 return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
1508}
1509
1510/// Converts the two double-precision floating-point elements of a
1511/// 128-bit vector of [2 x double] into two signed truncated (rounded toward
1512/// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1513///
1514/// If a converted value does not fit in a 32-bit integer, raises a
1515/// floating-point invalid exception. If the exception is masked, returns
1516/// the most negative integer.
1517///
1518/// \headerfile <x86intrin.h>
1519///
1520/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1521///
1522/// \param __a
1523/// A 128-bit vector of [2 x double].
1524/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1525static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
1526 return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
1527}
1528
1529/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1530/// [2 x i32] into two double-precision floating-point values, returned in a
1531/// 128-bit vector of [2 x double].
1532///
1533/// \headerfile <x86intrin.h>
1534///
1535/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1536///
1537/// \param __a
1538/// A 64-bit vector of [2 x i32].
1539/// \returns A 128-bit vector of [2 x double] containing the converted values.
1540static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1542 return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
1543}
1544
1545/// Returns the low-order element of a 128-bit vector of [2 x double] as
1546/// a double-precision floating-point value.
1547///
1548/// \headerfile <x86intrin.h>
1549///
1550/// This intrinsic has no corresponding instruction.
1551///
1552/// \param __a
1553/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1554/// \returns A double-precision floating-point value copied from the lower 64
1555/// bits of \a __a.
1556static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR
1558 return __a[0];
1559}
1560
1561/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1562/// memory location.
1563///
1564/// \headerfile <x86intrin.h>
1565///
1566/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1567///
1568/// \param __dp
1569/// A pointer to a 128-bit memory location. The address of the memory
1570/// location has to be 16-byte aligned.
1571/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1572static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1573 return *(const __m128d *)__dp;
1574}
1575
1576/// Loads a double-precision floating-point value from a specified memory
1577/// location and duplicates it to both vector elements of a 128-bit vector of
1578/// [2 x double].
1579///
1580/// \headerfile <x86intrin.h>
1581///
1582/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1583///
1584/// \param __dp
1585/// A pointer to a memory location containing a double-precision value.
1586/// \returns A 128-bit vector of [2 x double] containing the loaded and
1587/// duplicated values.
1588static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1589 struct __mm_load1_pd_struct {
1590 double __u;
1591 } __attribute__((__packed__, __may_alias__));
1592 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1593 return __extension__(__m128d){__u, __u};
1594}
1595
1596#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1597
1598/// Loads two double-precision values, in reverse order, from an aligned
1599/// memory location into a 128-bit vector of [2 x double].
1600///
1601/// \headerfile <x86intrin.h>
1602///
1603/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1604/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1605/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1606///
1607/// \param __dp
1608/// A 16-byte aligned pointer to an array of double-precision values to be
1609/// loaded in reverse order.
1610/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1611/// values.
1612static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1613 __m128d __u = *(const __m128d *)__dp;
1614 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1615}
1616
1617/// Loads a 128-bit floating-point vector of [2 x double] from an
1618/// unaligned memory location.
1619///
1620/// \headerfile <x86intrin.h>
1621///
1622/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1623///
1624/// \param __dp
1625/// A pointer to a 128-bit memory location. The address of the memory
1626/// location does not have to be aligned.
1627/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1628static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1629 struct __loadu_pd {
1630 __m128d_u __v;
1631 } __attribute__((__packed__, __may_alias__));
1632 return ((const struct __loadu_pd *)__dp)->__v;
1633}
1634
1635/// Loads a 64-bit integer value to the low element of a 128-bit integer
1636/// vector and clears the upper element.
1637///
1638/// \headerfile <x86intrin.h>
1639///
1640/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1641///
1642/// \param __a
1643/// A pointer to a 64-bit memory location. The address of the memory
1644/// location does not have to be aligned.
1645/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1646static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1647 struct __loadu_si64 {
1648 long long __v;
1649 } __attribute__((__packed__, __may_alias__));
1650 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1651 return __extension__(__m128i)(__v2di){__u, 0LL};
1652}
1653
1654/// Loads a 32-bit integer value to the low element of a 128-bit integer
1655/// vector and clears the upper element.
1656///
1657/// \headerfile <x86intrin.h>
1658///
1659/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1660///
1661/// \param __a
1662/// A pointer to a 32-bit memory location. The address of the memory
1663/// location does not have to be aligned.
1664/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1665static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1666 struct __loadu_si32 {
1667 int __v;
1668 } __attribute__((__packed__, __may_alias__));
1669 int __u = ((const struct __loadu_si32 *)__a)->__v;
1670 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1671}
1672
1673/// Loads a 16-bit integer value to the low element of a 128-bit integer
1674/// vector and clears the upper element.
1675///
1676/// \headerfile <x86intrin.h>
1677///
1678/// This intrinsic does not correspond to a specific instruction.
1679///
1680/// \param __a
1681/// A pointer to a 16-bit memory location. The address of the memory
1682/// location does not have to be aligned.
1683/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1684static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1685 struct __loadu_si16 {
1686 short __v;
1687 } __attribute__((__packed__, __may_alias__));
1688 short __u = ((const struct __loadu_si16 *)__a)->__v;
1689 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1690}
1691
1692/// Loads a 64-bit double-precision value to the low element of a
1693/// 128-bit integer vector and clears the upper element.
1694///
1695/// \headerfile <x86intrin.h>
1696///
1697/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1698///
1699/// \param __dp
1700/// A pointer to a memory location containing a double-precision value.
1701/// The address of the memory location does not have to be aligned.
1702/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1703static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1704 struct __mm_load_sd_struct {
1705 double __u;
1706 } __attribute__((__packed__, __may_alias__));
1707 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1708 return __extension__(__m128d){__u, 0};
1709}
1710
1711/// Loads a double-precision value into the high-order bits of a 128-bit
1712/// vector of [2 x double]. The low-order bits are copied from the low-order
1713/// bits of the first operand.
1714///
1715/// \headerfile <x86intrin.h>
1716///
1717/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1718///
1719/// \param __a
1720/// A 128-bit vector of [2 x double]. \n
1721/// Bits [63:0] are written to bits [63:0] of the result.
1722/// \param __dp
1723/// A pointer to a 64-bit memory location containing a double-precision
1724/// floating-point value that is loaded. The loaded value is written to bits
1725/// [127:64] of the result. The address of the memory location does not have
1726/// to be aligned.
1727/// \returns A 128-bit vector of [2 x double] containing the moved values.
1728static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1729 double const *__dp) {
1730 struct __mm_loadh_pd_struct {
1731 double __u;
1732 } __attribute__((__packed__, __may_alias__));
1733 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1734 return __extension__(__m128d){__a[0], __u};
1735}
1736
1737/// Loads a double-precision value into the low-order bits of a 128-bit
1738/// vector of [2 x double]. The high-order bits are copied from the
1739/// high-order bits of the first operand.
1740///
1741/// \headerfile <x86intrin.h>
1742///
1743/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1744///
1745/// \param __a
1746/// A 128-bit vector of [2 x double]. \n
1747/// Bits [127:64] are written to bits [127:64] of the result.
1748/// \param __dp
1749/// A pointer to a 64-bit memory location containing a double-precision
1750/// floating-point value that is loaded. The loaded value is written to bits
1751/// [63:0] of the result. The address of the memory location does not have to
1752/// be aligned.
1753/// \returns A 128-bit vector of [2 x double] containing the moved values.
1754static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1755 double const *__dp) {
1756 struct __mm_loadl_pd_struct {
1757 double __u;
1758 } __attribute__((__packed__, __may_alias__));
1759 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1760 return __extension__(__m128d){__u, __a[1]};
1761}
1762
1763/// Constructs a 128-bit floating-point vector of [2 x double] with
1764/// unspecified content. This could be used as an argument to another
1765/// intrinsic function where the argument is required but the value is not
1766/// actually used.
1767///
1768/// \headerfile <x86intrin.h>
1769///
1770/// This intrinsic has no corresponding instruction.
1771///
1772/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1773/// content.
1774static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1775 return (__m128d)__builtin_ia32_undef128();
1776}
1777
1778/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1779/// 64 bits of the vector are initialized with the specified double-precision
1780/// floating-point value. The upper 64 bits are set to zero.
1781///
1782/// \headerfile <x86intrin.h>
1783///
1784/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1785///
1786/// \param __w
1787/// A double-precision floating-point value used to initialize the lower 64
1788/// bits of the result.
1789/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1790/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1791/// set to zero.
1792static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w) {
1793 return __extension__(__m128d){__w, 0.0};
1794}
1795
1796/// Constructs a 128-bit floating-point vector of [2 x double], with each
1797/// of the two double-precision floating-point vector elements set to the
1798/// specified double-precision floating-point value.
1799///
1800/// \headerfile <x86intrin.h>
1801///
1802/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1803///
1804/// \param __w
1805/// A double-precision floating-point value used to initialize each vector
1806/// element of the result.
1807/// \returns An initialized 128-bit floating-point vector of [2 x double].
1808static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w) {
1809 return __extension__(__m128d){__w, __w};
1810}
1811
1812/// Constructs a 128-bit floating-point vector of [2 x double], with each
1813/// of the two double-precision floating-point vector elements set to the
1814/// specified double-precision floating-point value.
1815///
1816/// \headerfile <x86intrin.h>
1817///
1818/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1819///
1820/// \param __w
1821/// A double-precision floating-point value used to initialize each vector
1822/// element of the result.
1823/// \returns An initialized 128-bit floating-point vector of [2 x double].
1824static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w) {
1825 return _mm_set1_pd(__w);
1826}
1827
1828/// Constructs a 128-bit floating-point vector of [2 x double]
1829/// initialized with the specified double-precision floating-point values.
1830///
1831/// \headerfile <x86intrin.h>
1832///
1833/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1834///
1835/// \param __w
1836/// A double-precision floating-point value used to initialize the upper 64
1837/// bits of the result.
1838/// \param __x
1839/// A double-precision floating-point value used to initialize the lower 64
1840/// bits of the result.
1841/// \returns An initialized 128-bit floating-point vector of [2 x double].
1842static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w,
1843 double __x) {
1844 return __extension__(__m128d){__x, __w};
1845}
1846
1847/// Constructs a 128-bit floating-point vector of [2 x double],
1848/// initialized in reverse order with the specified double-precision
1849/// floating-point values.
1850///
1851/// \headerfile <x86intrin.h>
1852///
1853/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1854///
1855/// \param __w
1856/// A double-precision floating-point value used to initialize the lower 64
1857/// bits of the result.
1858/// \param __x
1859/// A double-precision floating-point value used to initialize the upper 64
1860/// bits of the result.
1861/// \returns An initialized 128-bit floating-point vector of [2 x double].
1862static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w,
1863 double __x) {
1864 return __extension__(__m128d){__w, __x};
1865}
1866
1867/// Constructs a 128-bit floating-point vector of [2 x double]
1868/// initialized to zero.
1869///
1870/// \headerfile <x86intrin.h>
1871///
1872/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1873///
1874/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1875/// all elements set to zero.
1876static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void) {
1877 return __extension__(__m128d){0.0, 0.0};
1878}
1879
1880/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1881/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1882/// 64 bits are set to the upper 64 bits of the first parameter.
1883///
1884/// \headerfile <x86intrin.h>
1885///
1886/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1887///
1888/// \param __a
1889/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1890/// upper 64 bits of the result.
1891/// \param __b
1892/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1893/// lower 64 bits of the result.
1894/// \returns A 128-bit vector of [2 x double] containing the moved values.
1895static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1896_mm_move_sd(__m128d __a, __m128d __b) {
1897 __a[0] = __b[0];
1898 return __a;
1899}
1900
1901/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1902/// memory location.
1903///
1904/// \headerfile <x86intrin.h>
1905///
1906/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1907///
1908/// \param __dp
1909/// A pointer to a 64-bit memory location.
1910/// \param __a
1911/// A 128-bit vector of [2 x double] containing the value to be stored.
1912static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1913 __m128d __a) {
1914 struct __mm_store_sd_struct {
1915 double __u;
1916 } __attribute__((__packed__, __may_alias__));
1917 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1918}
1919
1920/// Moves packed double-precision values from a 128-bit vector of
1921/// [2 x double] to a memory location.
1922///
1923/// \headerfile <x86intrin.h>
1924///
1925/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1926///
1927/// \param __dp
1928/// A pointer to an aligned memory location that can store two
1929/// double-precision values.
1930/// \param __a
1931/// A packed 128-bit vector of [2 x double] containing the values to be
1932/// moved.
1933static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1934 __m128d __a) {
1935 *(__m128d *)__dp = __a;
1936}
1937
1938/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1939/// the upper and lower 64 bits of a memory location.
1940///
1941/// \headerfile <x86intrin.h>
1942///
1943/// This intrinsic corresponds to the
1944/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1945///
1946/// \param __dp
1947/// A pointer to a memory location that can store two double-precision
1948/// values.
1949/// \param __a
1950/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1951/// of the values in \a __dp.
1952static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1953 __m128d __a) {
1954 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1955 _mm_store_pd(__dp, __a);
1956}
1957
1958/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1959/// the upper and lower 64 bits of a memory location.
1960///
1961/// \headerfile <x86intrin.h>
1962///
1963/// This intrinsic corresponds to the
1964/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1965///
1966/// \param __dp
1967/// A pointer to a memory location that can store two double-precision
1968/// values.
1969/// \param __a
1970/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1971/// of the values in \a __dp.
1972static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1973 __m128d __a) {
1974 _mm_store1_pd(__dp, __a);
1975}
1976
1977/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1978/// location.
1979///
1980/// \headerfile <x86intrin.h>
1981///
1982/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1983///
1984/// \param __dp
1985/// A pointer to a 128-bit memory location. The address of the memory
1986/// location does not have to be aligned.
1987/// \param __a
1988/// A 128-bit vector of [2 x double] containing the values to be stored.
1989static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1990 __m128d __a) {
1991 struct __storeu_pd {
1992 __m128d_u __v;
1993 } __attribute__((__packed__, __may_alias__));
1994 ((struct __storeu_pd *)__dp)->__v = __a;
1995}
1996
1997/// Stores two double-precision values, in reverse order, from a 128-bit
1998/// vector of [2 x double] to a 16-byte aligned memory location.
1999///
2000/// \headerfile <x86intrin.h>
2001///
2002/// This intrinsic corresponds to a shuffling instruction followed by a
2003/// <c> VMOVAPD / MOVAPD </c> instruction.
2004///
2005/// \param __dp
2006/// A pointer to a 16-byte aligned memory location that can store two
2007/// double-precision values.
2008/// \param __a
2009/// A 128-bit vector of [2 x double] containing the values to be reversed and
2010/// stored.
2011static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
2012 __m128d __a) {
2013 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2014 *(__m128d *)__dp = __a;
2015}
2016
2017/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2018/// memory location.
2019///
2020/// \headerfile <x86intrin.h>
2021///
2022/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2023///
2024/// \param __dp
2025/// A pointer to a 64-bit memory location.
2026/// \param __a
2027/// A 128-bit vector of [2 x double] containing the value to be stored.
2028static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2029 __m128d __a) {
2030 struct __mm_storeh_pd_struct {
2031 double __u;
2032 } __attribute__((__packed__, __may_alias__));
2033 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2034}
2035
2036/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2037/// memory location.
2038///
2039/// \headerfile <x86intrin.h>
2040///
2041/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2042///
2043/// \param __dp
2044/// A pointer to a 64-bit memory location.
2045/// \param __a
2046/// A 128-bit vector of [2 x double] containing the value to be stored.
2047static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2048 __m128d __a) {
2049 struct __mm_storeh_pd_struct {
2050 double __u;
2051 } __attribute__((__packed__, __may_alias__));
2052 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2053}
2054
2055/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2056/// saving the lower 8 bits of each sum in the corresponding element of a
2057/// 128-bit result vector of [16 x i8].
2058///
2059/// The integer elements of both parameters can be either signed or unsigned.
2060///
2061/// \headerfile <x86intrin.h>
2062///
2063/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2064///
2065/// \param __a
2066/// A 128-bit vector of [16 x i8].
2067/// \param __b
2068/// A 128-bit vector of [16 x i8].
2069/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2070/// parameters.
2071static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2072 __m128i __b) {
2073 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2074}
2075
2076/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2077/// saving the lower 16 bits of each sum in the corresponding element of a
2078/// 128-bit result vector of [8 x i16].
2079///
2080/// The integer elements of both parameters can be either signed or unsigned.
2081///
2082/// \headerfile <x86intrin.h>
2083///
2084/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2085///
2086/// \param __a
2087/// A 128-bit vector of [8 x i16].
2088/// \param __b
2089/// A 128-bit vector of [8 x i16].
2090/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2091/// parameters.
2092static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2093 __m128i __b) {
2094 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2095}
2096
2097/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2098/// saving the lower 32 bits of each sum in the corresponding element of a
2099/// 128-bit result vector of [4 x i32].
2100///
2101/// The integer elements of both parameters can be either signed or unsigned.
2102///
2103/// \headerfile <x86intrin.h>
2104///
2105/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2106///
2107/// \param __a
2108/// A 128-bit vector of [4 x i32].
2109/// \param __b
2110/// A 128-bit vector of [4 x i32].
2111/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2112/// parameters.
2113static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2114_mm_add_epi32(__m128i __a, __m128i __b) {
2115 return (__m128i)((__v4su)__a + (__v4su)__b);
2116}
2117
2118/// Adds two signed or unsigned 64-bit integer values, returning the
2119/// lower 64 bits of the sum.
2120///
2121/// \headerfile <x86intrin.h>
2122///
2123/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2124///
2125/// \param __a
2126/// A 64-bit integer.
2127/// \param __b
2128/// A 64-bit integer.
2129/// \returns A 64-bit integer containing the sum of both parameters.
2130static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
2131 return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
2132}
2133
2134/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2135/// saving the lower 64 bits of each sum in the corresponding element of a
2136/// 128-bit result vector of [2 x i64].
2137///
2138/// The integer elements of both parameters can be either signed or unsigned.
2139///
2140/// \headerfile <x86intrin.h>
2141///
2142/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2143///
2144/// \param __a
2145/// A 128-bit vector of [2 x i64].
2146/// \param __b
2147/// A 128-bit vector of [2 x i64].
2148/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2149/// parameters.
2150static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2151_mm_add_epi64(__m128i __a, __m128i __b) {
2152 return (__m128i)((__v2du)__a + (__v2du)__b);
2153}
2154
2155/// Adds, with saturation, the corresponding elements of two 128-bit
2156/// signed [16 x i8] vectors, saving each sum in the corresponding element
2157/// of a 128-bit result vector of [16 x i8].
2158///
2159/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2160/// less than 0x80 are saturated to 0x80.
2161///
2162/// \headerfile <x86intrin.h>
2163///
2164/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2165///
2166/// \param __a
2167/// A 128-bit signed [16 x i8] vector.
2168/// \param __b
2169/// A 128-bit signed [16 x i8] vector.
2170/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2171/// both parameters.
2172static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2173 __m128i __b) {
2174 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2175}
2176
2177/// Adds, with saturation, the corresponding elements of two 128-bit
2178/// signed [8 x i16] vectors, saving each sum in the corresponding element
2179/// of a 128-bit result vector of [8 x i16].
2180///
2181/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2182/// less than 0x8000 are saturated to 0x8000.
2183///
2184/// \headerfile <x86intrin.h>
2185///
2186/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2187///
2188/// \param __a
2189/// A 128-bit signed [8 x i16] vector.
2190/// \param __b
2191/// A 128-bit signed [8 x i16] vector.
2192/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2193/// both parameters.
2194static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2195 __m128i __b) {
2196 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2197}
2198
2199/// Adds, with saturation, the corresponding elements of two 128-bit
2200/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2201/// of a 128-bit result vector of [16 x i8].
2202///
2203/// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2204/// saturated to 0x00.
2205///
2206/// \headerfile <x86intrin.h>
2207///
2208/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2209///
2210/// \param __a
2211/// A 128-bit unsigned [16 x i8] vector.
2212/// \param __b
2213/// A 128-bit unsigned [16 x i8] vector.
2214/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2215/// of both parameters.
2216static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2217 __m128i __b) {
2218 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2219}
2220
2221/// Adds, with saturation, the corresponding elements of two 128-bit
2222/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2223/// of a 128-bit result vector of [8 x i16].
2224///
2225/// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2226/// are saturated to 0x0000.
2227///
2228/// \headerfile <x86intrin.h>
2229///
2230/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2231///
2232/// \param __a
2233/// A 128-bit unsigned [8 x i16] vector.
2234/// \param __b
2235/// A 128-bit unsigned [8 x i16] vector.
2236/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2237/// of both parameters.
2238static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2239 __m128i __b) {
2240 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2241}
2242
2243/// Computes the rounded averages of corresponding elements of two
2244/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2245/// corresponding element of a 128-bit result vector of [16 x i8].
2246///
2247/// \headerfile <x86intrin.h>
2248///
2249/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2250///
2251/// \param __a
2252/// A 128-bit unsigned [16 x i8] vector.
2253/// \param __b
2254/// A 128-bit unsigned [16 x i8] vector.
2255/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2256/// averages of both parameters.
2257static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2258 __m128i __b) {
2259 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2260}
2261
2262/// Computes the rounded averages of corresponding elements of two
2263/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2264/// corresponding element of a 128-bit result vector of [8 x i16].
2265///
2266/// \headerfile <x86intrin.h>
2267///
2268/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2269///
2270/// \param __a
2271/// A 128-bit unsigned [8 x i16] vector.
2272/// \param __b
2273/// A 128-bit unsigned [8 x i16] vector.
2274/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2275/// averages of both parameters.
2276static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2277 __m128i __b) {
2278 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2279}
2280
2281/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2282/// vectors, producing eight intermediate 32-bit signed integer products, and
2283/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2284/// [4 x i32] vector.
2285///
2286/// For example, bits [15:0] of both parameters are multiplied producing a
2287/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2288/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2289/// of the result.
2290///
2291/// \headerfile <x86intrin.h>
2292///
2293/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2294///
2295/// \param __a
2296/// A 128-bit signed [8 x i16] vector.
2297/// \param __b
2298/// A 128-bit signed [8 x i16] vector.
2299/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2300/// of both parameters.
2301static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2302 __m128i __b) {
2303 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2304}
2305
2306/// Compares corresponding elements of two 128-bit signed [8 x i16]
2307/// vectors, saving the greater value from each comparison in the
2308/// corresponding element of a 128-bit result vector of [8 x i16].
2309///
2310/// \headerfile <x86intrin.h>
2311///
2312/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2313///
2314/// \param __a
2315/// A 128-bit signed [8 x i16] vector.
2316/// \param __b
2317/// A 128-bit signed [8 x i16] vector.
2318/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2319/// each comparison.
2320static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2321 __m128i __b) {
2322 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2323}
2324
2325/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2326/// vectors, saving the greater value from each comparison in the
2327/// corresponding element of a 128-bit result vector of [16 x i8].
2328///
2329/// \headerfile <x86intrin.h>
2330///
2331/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2332///
2333/// \param __a
2334/// A 128-bit unsigned [16 x i8] vector.
2335/// \param __b
2336/// A 128-bit unsigned [16 x i8] vector.
2337/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2338/// each comparison.
2339static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2340 __m128i __b) {
2341 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2342}
2343
2344/// Compares corresponding elements of two 128-bit signed [8 x i16]
2345/// vectors, saving the smaller value from each comparison in the
2346/// corresponding element of a 128-bit result vector of [8 x i16].
2347///
2348/// \headerfile <x86intrin.h>
2349///
2350/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2351///
2352/// \param __a
2353/// A 128-bit signed [8 x i16] vector.
2354/// \param __b
2355/// A 128-bit signed [8 x i16] vector.
2356/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2357/// each comparison.
2358static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2359 __m128i __b) {
2360 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2361}
2362
2363/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2364/// vectors, saving the smaller value from each comparison in the
2365/// corresponding element of a 128-bit result vector of [16 x i8].
2366///
2367/// \headerfile <x86intrin.h>
2368///
2369/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2370///
2371/// \param __a
2372/// A 128-bit unsigned [16 x i8] vector.
2373/// \param __b
2374/// A 128-bit unsigned [16 x i8] vector.
2375/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2376/// each comparison.
2377static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2378 __m128i __b) {
2379 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2380}
2381
2382/// Multiplies the corresponding elements of two signed [8 x i16]
2383/// vectors, saving the upper 16 bits of each 32-bit product in the
2384/// corresponding element of a 128-bit signed [8 x i16] result vector.
2385///
2386/// \headerfile <x86intrin.h>
2387///
2388/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2389///
2390/// \param __a
2391/// A 128-bit signed [8 x i16] vector.
2392/// \param __b
2393/// A 128-bit signed [8 x i16] vector.
2394/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2395/// each of the eight 32-bit products.
2396static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2397 __m128i __b) {
2398 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2399}
2400
2401/// Multiplies the corresponding elements of two unsigned [8 x i16]
2402/// vectors, saving the upper 16 bits of each 32-bit product in the
2403/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2404///
2405/// \headerfile <x86intrin.h>
2406///
2407/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2408///
2409/// \param __a
2410/// A 128-bit unsigned [8 x i16] vector.
2411/// \param __b
2412/// A 128-bit unsigned [8 x i16] vector.
2413/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2414/// of each of the eight 32-bit products.
2415static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2416 __m128i __b) {
2417 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2418}
2419
2420/// Multiplies the corresponding elements of two signed [8 x i16]
2421/// vectors, saving the lower 16 bits of each 32-bit product in the
2422/// corresponding element of a 128-bit signed [8 x i16] result vector.
2423///
2424/// \headerfile <x86intrin.h>
2425///
2426/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2427///
2428/// \param __a
2429/// A 128-bit signed [8 x i16] vector.
2430/// \param __b
2431/// A 128-bit signed [8 x i16] vector.
2432/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2433/// each of the eight 32-bit products.
2434static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2435 __m128i __b) {
2436 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2437}
2438
2439/// Multiplies 32-bit unsigned integer values contained in the lower bits
2440/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2441/// product.
2442///
2443/// \headerfile <x86intrin.h>
2444///
2445/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2446///
2447/// \param __a
2448/// A 64-bit integer containing one of the source operands.
2449/// \param __b
2450/// A 64-bit integer containing one of the source operands.
2451/// \returns A 64-bit integer vector containing the product of both operands.
2452static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
2453 return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
2454 (__v4si)__anyext128(__b)));
2455}
2456
2457/// Multiplies 32-bit unsigned integer values contained in the lower
2458/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2459/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2460///
2461/// \headerfile <x86intrin.h>
2462///
2463/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2464///
2465/// \param __a
2466/// A [2 x i64] vector containing one of the source operands.
2467/// \param __b
2468/// A [2 x i64] vector containing one of the source operands.
2469/// \returns A [2 x i64] vector containing the product of both operands.
2470static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2471 __m128i __b) {
2472 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2473}
2474
2475/// Computes the absolute differences of corresponding 8-bit integer
2476/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2477/// separately sums the second 8 absolute differences. Packs these two
2478/// unsigned 16-bit integer sums into the upper and lower elements of a
2479/// [2 x i64] vector.
2480///
2481/// \headerfile <x86intrin.h>
2482///
2483/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2484///
2485/// \param __a
2486/// A 128-bit integer vector containing one of the source operands.
2487/// \param __b
2488/// A 128-bit integer vector containing one of the source operands.
2489/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2490/// differences between both operands.
2491static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2492 __m128i __b) {
2493 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2494}
2495
2496/// Subtracts the corresponding 8-bit integer values in the operands.
2497///
2498/// \headerfile <x86intrin.h>
2499///
2500/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2501///
2502/// \param __a
2503/// A 128-bit integer vector containing the minuends.
2504/// \param __b
2505/// A 128-bit integer vector containing the subtrahends.
2506/// \returns A 128-bit integer vector containing the differences of the values
2507/// in the operands.
2508static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2509 __m128i __b) {
2510 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2511}
2512
2513/// Subtracts the corresponding 16-bit integer values in the operands.
2514///
2515/// \headerfile <x86intrin.h>
2516///
2517/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2518///
2519/// \param __a
2520/// A 128-bit integer vector containing the minuends.
2521/// \param __b
2522/// A 128-bit integer vector containing the subtrahends.
2523/// \returns A 128-bit integer vector containing the differences of the values
2524/// in the operands.
2525static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2526 __m128i __b) {
2527 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2528}
2529
2530/// Subtracts the corresponding 32-bit integer values in the operands.
2531///
2532/// \headerfile <x86intrin.h>
2533///
2534/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2535///
2536/// \param __a
2537/// A 128-bit integer vector containing the minuends.
2538/// \param __b
2539/// A 128-bit integer vector containing the subtrahends.
2540/// \returns A 128-bit integer vector containing the differences of the values
2541/// in the operands.
2542static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2543_mm_sub_epi32(__m128i __a, __m128i __b) {
2544 return (__m128i)((__v4su)__a - (__v4su)__b);
2545}
2546
2547/// Subtracts signed or unsigned 64-bit integer values and writes the
2548/// difference to the corresponding bits in the destination.
2549///
2550/// \headerfile <x86intrin.h>
2551///
2552/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2553///
2554/// \param __a
2555/// A 64-bit integer vector containing the minuend.
2556/// \param __b
2557/// A 64-bit integer vector containing the subtrahend.
2558/// \returns A 64-bit integer vector containing the difference of the values in
2559/// the operands.
2560static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
2561 return (__m64)((unsigned long long)__a - (unsigned long long)__b);
2562}
2563
2564/// Subtracts the corresponding elements of two [2 x i64] vectors.
2565///
2566/// \headerfile <x86intrin.h>
2567///
2568/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2569///
2570/// \param __a
2571/// A 128-bit integer vector containing the minuends.
2572/// \param __b
2573/// A 128-bit integer vector containing the subtrahends.
2574/// \returns A 128-bit integer vector containing the differences of the values
2575/// in the operands.
2576static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2577_mm_sub_epi64(__m128i __a, __m128i __b) {
2578 return (__m128i)((__v2du)__a - (__v2du)__b);
2579}
2580
2581/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2582/// the input and returns the differences in the corresponding bytes in the
2583/// destination.
2584///
2585/// Differences greater than 0x7F are saturated to 0x7F, and differences
2586/// less than 0x80 are saturated to 0x80.
2587///
2588/// \headerfile <x86intrin.h>
2589///
2590/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2591///
2592/// \param __a
2593/// A 128-bit integer vector containing the minuends.
2594/// \param __b
2595/// A 128-bit integer vector containing the subtrahends.
2596/// \returns A 128-bit integer vector containing the differences of the values
2597/// in the operands.
2598static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2599 __m128i __b) {
2600 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2601}
2602
2603/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2604/// the input and returns the differences in the corresponding bytes in the
2605/// destination.
2606///
2607/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2608/// than 0x8000 are saturated to 0x8000.
2609///
2610/// \headerfile <x86intrin.h>
2611///
2612/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2613///
2614/// \param __a
2615/// A 128-bit integer vector containing the minuends.
2616/// \param __b
2617/// A 128-bit integer vector containing the subtrahends.
2618/// \returns A 128-bit integer vector containing the differences of the values
2619/// in the operands.
2620static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2621 __m128i __b) {
2622 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2623}
2624
2625/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2626/// the input and returns the differences in the corresponding bytes in the
2627/// destination.
2628///
2629/// Differences less than 0x00 are saturated to 0x00.
2630///
2631/// \headerfile <x86intrin.h>
2632///
2633/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2634///
2635/// \param __a
2636/// A 128-bit integer vector containing the minuends.
2637/// \param __b
2638/// A 128-bit integer vector containing the subtrahends.
2639/// \returns A 128-bit integer vector containing the unsigned integer
2640/// differences of the values in the operands.
2641static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2642 __m128i __b) {
2643 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2644}
2645
2646/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2647/// the input and returns the differences in the corresponding bytes in the
2648/// destination.
2649///
2650/// Differences less than 0x0000 are saturated to 0x0000.
2651///
2652/// \headerfile <x86intrin.h>
2653///
2654/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2655///
2656/// \param __a
2657/// A 128-bit integer vector containing the minuends.
2658/// \param __b
2659/// A 128-bit integer vector containing the subtrahends.
2660/// \returns A 128-bit integer vector containing the unsigned integer
2661/// differences of the values in the operands.
2662static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2663 __m128i __b) {
2664 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2665}
2666
2667/// Performs a bitwise AND of two 128-bit integer vectors.
2668///
2669/// \headerfile <x86intrin.h>
2670///
2671/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2672///
2673/// \param __a
2674/// A 128-bit integer vector containing one of the source operands.
2675/// \param __b
2676/// A 128-bit integer vector containing one of the source operands.
2677/// \returns A 128-bit integer vector containing the bitwise AND of the values
2678/// in both operands.
2679static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2680 __m128i __b) {
2681 return (__m128i)((__v2du)__a & (__v2du)__b);
2682}
2683
2684/// Performs a bitwise AND of two 128-bit integer vectors, using the
2685/// one's complement of the values contained in the first source operand.
2686///
2687/// \headerfile <x86intrin.h>
2688///
2689/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2690///
2691/// \param __a
2692/// A 128-bit vector containing the left source operand. The one's complement
2693/// of this value is used in the bitwise AND.
2694/// \param __b
2695/// A 128-bit vector containing the right source operand.
2696/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2697/// complement of the first operand and the values in the second operand.
2698static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2699 __m128i __b) {
2700 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2701}
2702/// Performs a bitwise OR of two 128-bit integer vectors.
2703///
2704/// \headerfile <x86intrin.h>
2705///
2706/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2707///
2708/// \param __a
2709/// A 128-bit integer vector containing one of the source operands.
2710/// \param __b
2711/// A 128-bit integer vector containing one of the source operands.
2712/// \returns A 128-bit integer vector containing the bitwise OR of the values
2713/// in both operands.
2714static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2715 __m128i __b) {
2716 return (__m128i)((__v2du)__a | (__v2du)__b);
2717}
2718
2719/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2720///
2721/// \headerfile <x86intrin.h>
2722///
2723/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2724///
2725/// \param __a
2726/// A 128-bit integer vector containing one of the source operands.
2727/// \param __b
2728/// A 128-bit integer vector containing one of the source operands.
2729/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2730/// values in both operands.
2731static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2732 __m128i __b) {
2733 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2734}
2735
2736/// Left-shifts the 128-bit integer vector operand by the specified
2737/// number of bytes. Low-order bits are cleared.
2738///
2739/// \headerfile <x86intrin.h>
2740///
2741/// \code
2742/// __m128i _mm_slli_si128(__m128i a, const int imm);
2743/// \endcode
2744///
2745/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2746///
2747/// \param a
2748/// A 128-bit integer vector containing the source operand.
2749/// \param imm
2750/// An immediate value specifying the number of bytes to left-shift operand
2751/// \a a.
2752/// \returns A 128-bit integer vector containing the left-shifted value.
2753#define _mm_slli_si128(a, imm) \
2754 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2755 (int)(imm)))
2756
2757#define _mm_bslli_si128(a, imm) \
2758 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2759 (int)(imm)))
2760
2761/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2762/// by the specified number of bits. Low-order bits are cleared.
2763///
2764/// \headerfile <x86intrin.h>
2765///
2766/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2767///
2768/// \param __a
2769/// A 128-bit integer vector containing the source operand.
2770/// \param __count
2771/// An integer value specifying the number of bits to left-shift each value
2772/// in operand \a __a.
2773/// \returns A 128-bit integer vector containing the left-shifted values.
2774static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2775 int __count) {
2776 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2777}
2778
2779/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2780/// by the specified number of bits. Low-order bits are cleared.
2781///
2782/// \headerfile <x86intrin.h>
2783///
2784/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2785///
2786/// \param __a
2787/// A 128-bit integer vector containing the source operand.
2788/// \param __count
2789/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2790/// to left-shift each value in operand \a __a.
2791/// \returns A 128-bit integer vector containing the left-shifted values.
2792static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2793 __m128i __count) {
2794 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2795}
2796
2797/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2798/// by the specified number of bits. Low-order bits are cleared.
2799///
2800/// \headerfile <x86intrin.h>
2801///
2802/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2803///
2804/// \param __a
2805/// A 128-bit integer vector containing the source operand.
2806/// \param __count
2807/// An integer value specifying the number of bits to left-shift each value
2808/// in operand \a __a.
2809/// \returns A 128-bit integer vector containing the left-shifted values.
2810static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2811 int __count) {
2812 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2813}
2814
2815/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2816/// by the specified number of bits. Low-order bits are cleared.
2817///
2818/// \headerfile <x86intrin.h>
2819///
2820/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2821///
2822/// \param __a
2823/// A 128-bit integer vector containing the source operand.
2824/// \param __count
2825/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2826/// to left-shift each value in operand \a __a.
2827/// \returns A 128-bit integer vector containing the left-shifted values.
2828static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2829 __m128i __count) {
2830 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2831}
2832
2833/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2834/// by the specified number of bits. Low-order bits are cleared.
2835///
2836/// \headerfile <x86intrin.h>
2837///
2838/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2839///
2840/// \param __a
2841/// A 128-bit integer vector containing the source operand.
2842/// \param __count
2843/// An integer value specifying the number of bits to left-shift each value
2844/// in operand \a __a.
2845/// \returns A 128-bit integer vector containing the left-shifted values.
2846static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2847 int __count) {
2848 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2849}
2850
2851/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2852/// by the specified number of bits. Low-order bits are cleared.
2853///
2854/// \headerfile <x86intrin.h>
2855///
2856/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2857///
2858/// \param __a
2859/// A 128-bit integer vector containing the source operand.
2860/// \param __count
2861/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2862/// to left-shift each value in operand \a __a.
2863/// \returns A 128-bit integer vector containing the left-shifted values.
2864static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2865 __m128i __count) {
2866 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2867}
2868
2869/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2870/// by the specified number of bits. High-order bits are filled with the sign
2871/// bit of the initial value.
2872///
2873/// \headerfile <x86intrin.h>
2874///
2875/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2876///
2877/// \param __a
2878/// A 128-bit integer vector containing the source operand.
2879/// \param __count
2880/// An integer value specifying the number of bits to right-shift each value
2881/// in operand \a __a.
2882/// \returns A 128-bit integer vector containing the right-shifted values.
2883static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2884 int __count) {
2885 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2886}
2887
2888/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2889/// by the specified number of bits. High-order bits are filled with the sign
2890/// bit of the initial value.
2891///
2892/// \headerfile <x86intrin.h>
2893///
2894/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2895///
2896/// \param __a
2897/// A 128-bit integer vector containing the source operand.
2898/// \param __count
2899/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2900/// to right-shift each value in operand \a __a.
2901/// \returns A 128-bit integer vector containing the right-shifted values.
2902static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2903 __m128i __count) {
2904 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2905}
2906
2907/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2908/// by the specified number of bits. High-order bits are filled with the sign
2909/// bit of the initial value.
2910///
2911/// \headerfile <x86intrin.h>
2912///
2913/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2914///
2915/// \param __a
2916/// A 128-bit integer vector containing the source operand.
2917/// \param __count
2918/// An integer value specifying the number of bits to right-shift each value
2919/// in operand \a __a.
2920/// \returns A 128-bit integer vector containing the right-shifted values.
2921static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2922 int __count) {
2923 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2924}
2925
2926/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2927/// by the specified number of bits. High-order bits are filled with the sign
2928/// bit of the initial value.
2929///
2930/// \headerfile <x86intrin.h>
2931///
2932/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2933///
2934/// \param __a
2935/// A 128-bit integer vector containing the source operand.
2936/// \param __count
2937/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2938/// to right-shift each value in operand \a __a.
2939/// \returns A 128-bit integer vector containing the right-shifted values.
2940static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2941 __m128i __count) {
2942 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2943}
2944
2945/// Right-shifts the 128-bit integer vector operand by the specified
2946/// number of bytes. High-order bits are cleared.
2947///
2948/// \headerfile <x86intrin.h>
2949///
2950/// \code
2951/// __m128i _mm_srli_si128(__m128i a, const int imm);
2952/// \endcode
2953///
2954/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2955///
2956/// \param a
2957/// A 128-bit integer vector containing the source operand.
2958/// \param imm
2959/// An immediate value specifying the number of bytes to right-shift operand
2960/// \a a.
2961/// \returns A 128-bit integer vector containing the right-shifted value.
2962#define _mm_srli_si128(a, imm) \
2963 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2964 (int)(imm)))
2965
2966#define _mm_bsrli_si128(a, imm) \
2967 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2968 (int)(imm)))
2969
2970/// Right-shifts each of 16-bit values in the 128-bit integer vector
2971/// operand by the specified number of bits. High-order bits are cleared.
2972///
2973/// \headerfile <x86intrin.h>
2974///
2975/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2976///
2977/// \param __a
2978/// A 128-bit integer vector containing the source operand.
2979/// \param __count
2980/// An integer value specifying the number of bits to right-shift each value
2981/// in operand \a __a.
2982/// \returns A 128-bit integer vector containing the right-shifted values.
2983static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2984 int __count) {
2985 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2986}
2987
2988/// Right-shifts each of 16-bit values in the 128-bit integer vector
2989/// operand by the specified number of bits. High-order bits are cleared.
2990///
2991/// \headerfile <x86intrin.h>
2992///
2993/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2994///
2995/// \param __a
2996/// A 128-bit integer vector containing the source operand.
2997/// \param __count
2998/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2999/// to right-shift each value in operand \a __a.
3000/// \returns A 128-bit integer vector containing the right-shifted values.
3001static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
3002 __m128i __count) {
3003 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3004}
3005
3006/// Right-shifts each of 32-bit values in the 128-bit integer vector
3007/// operand by the specified number of bits. High-order bits are cleared.
3008///
3009/// \headerfile <x86intrin.h>
3010///
3011/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3012///
3013/// \param __a
3014/// A 128-bit integer vector containing the source operand.
3015/// \param __count
3016/// An integer value specifying the number of bits to right-shift each value
3017/// in operand \a __a.
3018/// \returns A 128-bit integer vector containing the right-shifted values.
3019static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3020 int __count) {
3021 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3022}
3023
3024/// Right-shifts each of 32-bit values in the 128-bit integer vector
3025/// operand by the specified number of bits. High-order bits are cleared.
3026///
3027/// \headerfile <x86intrin.h>
3028///
3029/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3030///
3031/// \param __a
3032/// A 128-bit integer vector containing the source operand.
3033/// \param __count
3034/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3035/// to right-shift each value in operand \a __a.
3036/// \returns A 128-bit integer vector containing the right-shifted values.
3037static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3038 __m128i __count) {
3039 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3040}
3041
3042/// Right-shifts each of 64-bit values in the 128-bit integer vector
3043/// operand by the specified number of bits. High-order bits are cleared.
3044///
3045/// \headerfile <x86intrin.h>
3046///
3047/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3048///
3049/// \param __a
3050/// A 128-bit integer vector containing the source operand.
3051/// \param __count
3052/// An integer value specifying the number of bits to right-shift each value
3053/// in operand \a __a.
3054/// \returns A 128-bit integer vector containing the right-shifted values.
3055static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3056 int __count) {
3057 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3058}
3059
3060/// Right-shifts each of 64-bit values in the 128-bit integer vector
3061/// operand by the specified number of bits. High-order bits are cleared.
3062///
3063/// \headerfile <x86intrin.h>
3064///
3065/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3066///
3067/// \param __a
3068/// A 128-bit integer vector containing the source operand.
3069/// \param __count
3070/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3071/// to right-shift each value in operand \a __a.
3072/// \returns A 128-bit integer vector containing the right-shifted values.
3073static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3074 __m128i __count) {
3075 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3076}
3077
3078/// Compares each of the corresponding 8-bit values of the 128-bit
3079/// integer vectors for equality.
3080///
3081/// Each comparison returns 0x0 for false, 0xFF for true.
3082///
3083/// \headerfile <x86intrin.h>
3084///
3085/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3086///
3087/// \param __a
3088/// A 128-bit integer vector.
3089/// \param __b
3090/// A 128-bit integer vector.
3091/// \returns A 128-bit integer vector containing the comparison results.
3092static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3093 __m128i __b) {
3094 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3095}
3096
3097/// Compares each of the corresponding 16-bit values of the 128-bit
3098/// integer vectors for equality.
3099///
3100/// Each comparison returns 0x0 for false, 0xFFFF for true.
3101///
3102/// \headerfile <x86intrin.h>
3103///
3104/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3105///
3106/// \param __a
3107/// A 128-bit integer vector.
3108/// \param __b
3109/// A 128-bit integer vector.
3110/// \returns A 128-bit integer vector containing the comparison results.
3111static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3112 __m128i __b) {
3113 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3114}
3115
3116/// Compares each of the corresponding 32-bit values of the 128-bit
3117/// integer vectors for equality.
3118///
3119/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3120///
3121/// \headerfile <x86intrin.h>
3122///
3123/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3124///
3125/// \param __a
3126/// A 128-bit integer vector.
3127/// \param __b
3128/// A 128-bit integer vector.
3129/// \returns A 128-bit integer vector containing the comparison results.
3130static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3131 __m128i __b) {
3132 return (__m128i)((__v4si)__a == (__v4si)__b);
3133}
3134
3135/// Compares each of the corresponding signed 8-bit values of the 128-bit
3136/// integer vectors to determine if the values in the first operand are
3137/// greater than those in the second operand.
3138///
3139/// Each comparison returns 0x0 for false, 0xFF for true.
3140///
3141/// \headerfile <x86intrin.h>
3142///
3143/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3144///
3145/// \param __a
3146/// A 128-bit integer vector.
3147/// \param __b
3148/// A 128-bit integer vector.
3149/// \returns A 128-bit integer vector containing the comparison results.
3150static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3151 __m128i __b) {
3152 /* This function always performs a signed comparison, but __v16qi is a char
3153 which may be signed or unsigned, so use __v16qs. */
3154 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3155}
3156
3157/// Compares each of the corresponding signed 16-bit values of the
3158/// 128-bit integer vectors to determine if the values in the first operand
3159/// are greater than those in the second operand.
3160///
3161/// Each comparison returns 0x0 for false, 0xFFFF for true.
3162///
3163/// \headerfile <x86intrin.h>
3164///
3165/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3166///
3167/// \param __a
3168/// A 128-bit integer vector.
3169/// \param __b
3170/// A 128-bit integer vector.
3171/// \returns A 128-bit integer vector containing the comparison results.
3172static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3173 __m128i __b) {
3174 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3175}
3176
3177/// Compares each of the corresponding signed 32-bit values of the
3178/// 128-bit integer vectors to determine if the values in the first operand
3179/// are greater than those in the second operand.
3180///
3181/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3182///
3183/// \headerfile <x86intrin.h>
3184///
3185/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3186///
3187/// \param __a
3188/// A 128-bit integer vector.
3189/// \param __b
3190/// A 128-bit integer vector.
3191/// \returns A 128-bit integer vector containing the comparison results.
3192static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3193 __m128i __b) {
3194 return (__m128i)((__v4si)__a > (__v4si)__b);
3195}
3196
3197/// Compares each of the corresponding signed 8-bit values of the 128-bit
3198/// integer vectors to determine if the values in the first operand are less
3199/// than those in the second operand.
3200///
3201/// Each comparison returns 0x0 for false, 0xFF for true.
3202///
3203/// \headerfile <x86intrin.h>
3204///
3205/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3206///
3207/// \param __a
3208/// A 128-bit integer vector.
3209/// \param __b
3210/// A 128-bit integer vector.
3211/// \returns A 128-bit integer vector containing the comparison results.
3212static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3213 __m128i __b) {
3214 return _mm_cmpgt_epi8(__b, __a);
3215}
3216
3217/// Compares each of the corresponding signed 16-bit values of the
3218/// 128-bit integer vectors to determine if the values in the first operand
3219/// are less than those in the second operand.
3220///
3221/// Each comparison returns 0x0 for false, 0xFFFF for true.
3222///
3223/// \headerfile <x86intrin.h>
3224///
3225/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3226///
3227/// \param __a
3228/// A 128-bit integer vector.
3229/// \param __b
3230/// A 128-bit integer vector.
3231/// \returns A 128-bit integer vector containing the comparison results.
3232static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3233 __m128i __b) {
3234 return _mm_cmpgt_epi16(__b, __a);
3235}
3236
3237/// Compares each of the corresponding signed 32-bit values of the
3238/// 128-bit integer vectors to determine if the values in the first operand
3239/// are less than those in the second operand.
3240///
3241/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3242///
3243/// \headerfile <x86intrin.h>
3244///
3245/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3246///
3247/// \param __a
3248/// A 128-bit integer vector.
3249/// \param __b
3250/// A 128-bit integer vector.
3251/// \returns A 128-bit integer vector containing the comparison results.
3252static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3253 __m128i __b) {
3254 return _mm_cmpgt_epi32(__b, __a);
3255}
3256
3257#ifdef __x86_64__
3258/// Converts a 64-bit signed integer value from the second operand into a
3259/// double-precision value and returns it in the lower element of a [2 x
3260/// double] vector; the upper element of the returned vector is copied from
3261/// the upper element of the first operand.
3262///
3263/// \headerfile <x86intrin.h>
3264///
3265/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3266///
3267/// \param __a
3268/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3269/// copied to the upper 64 bits of the destination.
3270/// \param __b
3271/// A 64-bit signed integer operand containing the value to be converted.
3272/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3273/// converted value of the second operand. The upper 64 bits are copied from
3274/// the upper 64 bits of the first operand.
3275static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
3276_mm_cvtsi64_sd(__m128d __a, long long __b) {
3277 __a[0] = __b;
3278 return __a;
3279}
3280
3281/// Converts the first (lower) element of a vector of [2 x double] into a
3282/// 64-bit signed integer value.
3283///
3284/// If the converted value does not fit in a 64-bit integer, raises a
3285/// floating-point invalid exception. If the exception is masked, returns
3286/// the most negative integer.
3287///
3288/// \headerfile <x86intrin.h>
3289///
3290/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3291///
3292/// \param __a
3293/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3294/// conversion.
3295/// \returns A 64-bit signed integer containing the converted value.
3296static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3297 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3298}
3299
3300/// Converts the first (lower) element of a vector of [2 x double] into a
3301/// 64-bit signed truncated (rounded toward zero) integer value.
3302///
3303/// If a converted value does not fit in a 64-bit integer, raises a
3304/// floating-point invalid exception. If the exception is masked, returns
3305/// the most negative integer.
3306///
3307/// \headerfile <x86intrin.h>
3308///
3309/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3310/// instruction.
3311///
3312/// \param __a
3313/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3314/// conversion.
3315/// \returns A 64-bit signed integer containing the converted value.
3316static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3317 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3318}
3319#endif
3320
3321/// Converts a vector of [4 x i32] into a vector of [4 x float].
3322///
3323/// \headerfile <x86intrin.h>
3324///
3325/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3326///
3327/// \param __a
3328/// A 128-bit integer vector.
3329/// \returns A 128-bit vector of [4 x float] containing the converted values.
3330static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
3332 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3333}
3334
3335/// Converts a vector of [4 x float] into a vector of [4 x i32].
3336///
3337/// If a converted value does not fit in a 32-bit integer, raises a
3338/// floating-point invalid exception. If the exception is masked, returns
3339/// the most negative integer.
3340///
3341/// \headerfile <x86intrin.h>
3342///
3343/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3344///
3345/// \param __a
3346/// A 128-bit vector of [4 x float].
3347/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3348/// values.
3349static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3350 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3351}
3352
3353/// Converts a vector of [4 x float] into four signed truncated (rounded toward
3354/// zero) 32-bit integers, returned in a vector of [4 x i32].
3355///
3356/// If a converted value does not fit in a 32-bit integer, raises a
3357/// floating-point invalid exception. If the exception is masked, returns
3358/// the most negative integer.
3359///
3360/// \headerfile <x86intrin.h>
3361///
3362/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3363/// instruction.
3364///
3365/// \param __a
3366/// A 128-bit vector of [4 x float].
3367/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3368static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3369 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3370}
3371
3372/// Returns a vector of [4 x i32] where the lowest element is the input
3373/// operand and the remaining elements are zero.
3374///
3375/// \headerfile <x86intrin.h>
3376///
3377/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3378///
3379/// \param __a
3380/// A 32-bit signed integer operand.
3381/// \returns A 128-bit vector of [4 x i32].
3382static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3383 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3384}
3385
3386/// Returns a vector of [2 x i64] where the lower element is the input
3387/// operand and the upper element is zero.
3388///
3389/// \headerfile <x86intrin.h>
3390///
3391/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3392/// in 64-bit mode.
3393///
3394/// \param __a
3395/// A 64-bit signed integer operand containing the value to be converted.
3396/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3397static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3398 return __extension__(__m128i)(__v2di){__a, 0};
3399}
3400
3401/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3402/// 32-bit signed integer value.
3403///
3404/// \headerfile <x86intrin.h>
3405///
3406/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3407///
3408/// \param __a
3409/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3410/// destination.
3411/// \returns A 32-bit signed integer containing the moved value.
3412static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3413 __v4si __b = (__v4si)__a;
3414 return __b[0];
3415}
3416
3417/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3418/// 64-bit signed integer value.
3419///
3420/// \headerfile <x86intrin.h>
3421///
3422/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3423///
3424/// \param __a
3425/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3426/// destination.
3427/// \returns A 64-bit signed integer containing the moved value.
3428static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3429 return __a[0];
3430}
3431
3432/// Moves packed integer values from an aligned 128-bit memory location
3433/// to elements in a 128-bit integer vector.
3434///
3435/// \headerfile <x86intrin.h>
3436///
3437/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3438///
3439/// \param __p
3440/// An aligned pointer to a memory location containing integer values.
3441/// \returns A 128-bit integer vector containing the moved values.
3442static __inline__ __m128i __DEFAULT_FN_ATTRS
3443_mm_load_si128(__m128i const *__p) {
3444 return *__p;
3445}
3446
3447/// Moves packed integer values from an unaligned 128-bit memory location
3448/// to elements in a 128-bit integer vector.
3449///
3450/// \headerfile <x86intrin.h>
3451///
3452/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3453///
3454/// \param __p
3455/// A pointer to a memory location containing integer values.
3456/// \returns A 128-bit integer vector containing the moved values.
3457static __inline__ __m128i __DEFAULT_FN_ATTRS
3458_mm_loadu_si128(__m128i_u const *__p) {
3459 struct __loadu_si128 {
3460 __m128i_u __v;
3461 } __attribute__((__packed__, __may_alias__));
3462 return ((const struct __loadu_si128 *)__p)->__v;
3463}
3464
3465/// Returns a vector of [2 x i64] where the lower element is taken from
3466/// the lower element of the operand, and the upper element is zero.
3467///
3468/// \headerfile <x86intrin.h>
3469///
3470/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3471///
3472/// \param __p
3473/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3474/// the destination.
3475/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3476/// moved value. The higher order bits are cleared.
3477static __inline__ __m128i __DEFAULT_FN_ATTRS
3478_mm_loadl_epi64(__m128i_u const *__p) {
3479 struct __mm_loadl_epi64_struct {
3480 long long __u;
3481 } __attribute__((__packed__, __may_alias__));
3482 return __extension__(__m128i){
3483 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3484}
3485
3486/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3487/// This could be used as an argument to another intrinsic function where the
3488/// argument is required but the value is not actually used.
3489///
3490/// \headerfile <x86intrin.h>
3491///
3492/// This intrinsic has no corresponding instruction.
3493///
3494/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3495static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3496 return (__m128i)__builtin_ia32_undef128();
3497}
3498
3499/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3500/// the specified 64-bit integer values.
3501///
3502/// \headerfile <x86intrin.h>
3503///
3504/// This intrinsic is a utility function and does not correspond to a specific
3505/// instruction.
3506///
3507/// \param __q1
3508/// A 64-bit integer value used to initialize the upper 64 bits of the
3509/// destination vector of [2 x i64].
3510/// \param __q0
3511/// A 64-bit integer value used to initialize the lower 64 bits of the
3512/// destination vector of [2 x i64].
3513/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3514/// provided in the operands.
3515static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3516_mm_set_epi64x(long long __q1, long long __q0) {
3517 return __extension__(__m128i)(__v2di){__q0, __q1};
3518}
3519
3520/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3521/// the specified 64-bit integer values.
3522///
3523/// \headerfile <x86intrin.h>
3524///
3525/// This intrinsic is a utility function and does not correspond to a specific
3526/// instruction.
3527///
3528/// \param __q1
3529/// A 64-bit integer value used to initialize the upper 64 bits of the
3530/// destination vector of [2 x i64].
3531/// \param __q0
3532/// A 64-bit integer value used to initialize the lower 64 bits of the
3533/// destination vector of [2 x i64].
3534/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3535/// provided in the operands.
3536static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3537_mm_set_epi64(__m64 __q1, __m64 __q0) {
3538 return _mm_set_epi64x((long long)__q1[0], (long long)__q0[0]);
3539}
3540
3541/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3542/// the specified 32-bit integer values.
3543///
3544/// \headerfile <x86intrin.h>
3545///
3546/// This intrinsic is a utility function and does not correspond to a specific
3547/// instruction.
3548///
3549/// \param __i3
3550/// A 32-bit integer value used to initialize bits [127:96] of the
3551/// destination vector.
3552/// \param __i2
3553/// A 32-bit integer value used to initialize bits [95:64] of the destination
3554/// vector.
3555/// \param __i1
3556/// A 32-bit integer value used to initialize bits [63:32] of the destination
3557/// vector.
3558/// \param __i0
3559/// A 32-bit integer value used to initialize bits [31:0] of the destination
3560/// vector.
3561/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3562/// provided in the operands.
3563static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3,
3564 int __i2,
3565 int __i1,
3566 int __i0) {
3567 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3568}
3569
3570/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3571/// the specified 16-bit integer values.
3572///
3573/// \headerfile <x86intrin.h>
3574///
3575/// This intrinsic is a utility function and does not correspond to a specific
3576/// instruction.
3577///
3578/// \param __w7
3579/// A 16-bit integer value used to initialize bits [127:112] of the
3580/// destination vector.
3581/// \param __w6
3582/// A 16-bit integer value used to initialize bits [111:96] of the
3583/// destination vector.
3584/// \param __w5
3585/// A 16-bit integer value used to initialize bits [95:80] of the destination
3586/// vector.
3587/// \param __w4
3588/// A 16-bit integer value used to initialize bits [79:64] of the destination
3589/// vector.
3590/// \param __w3
3591/// A 16-bit integer value used to initialize bits [63:48] of the destination
3592/// vector.
3593/// \param __w2
3594/// A 16-bit integer value used to initialize bits [47:32] of the destination
3595/// vector.
3596/// \param __w1
3597/// A 16-bit integer value used to initialize bits [31:16] of the destination
3598/// vector.
3599/// \param __w0
3600/// A 16-bit integer value used to initialize bits [15:0] of the destination
3601/// vector.
3602/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3603/// provided in the operands.
3604static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3605_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3606 short __w2, short __w1, short __w0) {
3607 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3608 __w4, __w5, __w6, __w7};
3609}
3610
3611/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3612/// the specified 8-bit integer values.
3613///
3614/// \headerfile <x86intrin.h>
3615///
3616/// This intrinsic is a utility function and does not correspond to a specific
3617/// instruction.
3618///
3619/// \param __b15
3620/// Initializes bits [127:120] of the destination vector.
3621/// \param __b14
3622/// Initializes bits [119:112] of the destination vector.
3623/// \param __b13
3624/// Initializes bits [111:104] of the destination vector.
3625/// \param __b12
3626/// Initializes bits [103:96] of the destination vector.
3627/// \param __b11
3628/// Initializes bits [95:88] of the destination vector.
3629/// \param __b10
3630/// Initializes bits [87:80] of the destination vector.
3631/// \param __b9
3632/// Initializes bits [79:72] of the destination vector.
3633/// \param __b8
3634/// Initializes bits [71:64] of the destination vector.
3635/// \param __b7
3636/// Initializes bits [63:56] of the destination vector.
3637/// \param __b6
3638/// Initializes bits [55:48] of the destination vector.
3639/// \param __b5
3640/// Initializes bits [47:40] of the destination vector.
3641/// \param __b4
3642/// Initializes bits [39:32] of the destination vector.
3643/// \param __b3
3644/// Initializes bits [31:24] of the destination vector.
3645/// \param __b2
3646/// Initializes bits [23:16] of the destination vector.
3647/// \param __b1
3648/// Initializes bits [15:8] of the destination vector.
3649/// \param __b0
3650/// Initializes bits [7:0] of the destination vector.
3651/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3652/// provided in the operands.
3653static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3654_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3655 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3656 char __b4, char __b3, char __b2, char __b1, char __b0) {
3657 return __extension__(__m128i)(__v16qi){
3658 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3659 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3660}
3661
3662/// Initializes both values in a 128-bit integer vector with the
3663/// specified 64-bit integer value.
3664///
3665/// \headerfile <x86intrin.h>
3666///
3667/// This intrinsic is a utility function and does not correspond to a specific
3668/// instruction.
3669///
3670/// \param __q
3671/// Integer value used to initialize the elements of the destination integer
3672/// vector.
3673/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3674/// elements containing the value provided in the operand.
3675static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3676_mm_set1_epi64x(long long __q) {
3677 return _mm_set_epi64x(__q, __q);
3678}
3679
3680/// Initializes both values in a 128-bit vector of [2 x i64] with the
3681/// specified 64-bit value.
3682///
3683/// \headerfile <x86intrin.h>
3684///
3685/// This intrinsic is a utility function and does not correspond to a specific
3686/// instruction.
3687///
3688/// \param __q
3689/// A 64-bit value used to initialize the elements of the destination integer
3690/// vector.
3691/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3692/// containing the value provided in the operand.
3693static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3694_mm_set1_epi64(__m64 __q) {
3695 return _mm_set_epi64(__q, __q);
3696}
3697
3698/// Initializes all values in a 128-bit vector of [4 x i32] with the
3699/// specified 32-bit value.
3700///
3701/// \headerfile <x86intrin.h>
3702///
3703/// This intrinsic is a utility function and does not correspond to a specific
3704/// instruction.
3705///
3706/// \param __i
3707/// A 32-bit value used to initialize the elements of the destination integer
3708/// vector.
3709/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3710/// containing the value provided in the operand.
3711static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i) {
3712 return _mm_set_epi32(__i, __i, __i, __i);
3713}
3714
3715/// Initializes all values in a 128-bit vector of [8 x i16] with the
3716/// specified 16-bit value.
3717///
3718/// \headerfile <x86intrin.h>
3719///
3720/// This intrinsic is a utility function and does not correspond to a specific
3721/// instruction.
3722///
3723/// \param __w
3724/// A 16-bit value used to initialize the elements of the destination integer
3725/// vector.
3726/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3727/// containing the value provided in the operand.
3728static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3729_mm_set1_epi16(short __w) {
3730 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3731}
3732
3733/// Initializes all values in a 128-bit vector of [16 x i8] with the
3734/// specified 8-bit value.
3735///
3736/// \headerfile <x86intrin.h>
3737///
3738/// This intrinsic is a utility function and does not correspond to a specific
3739/// instruction.
3740///
3741/// \param __b
3742/// An 8-bit value used to initialize the elements of the destination integer
3743/// vector.
3744/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3745/// containing the value provided in the operand.
3746static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b) {
3747 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3748 __b, __b, __b, __b, __b);
3749}
3750
3751/// Constructs a 128-bit integer vector, initialized in reverse order
3752/// with the specified 64-bit integral values.
3753///
3754/// \headerfile <x86intrin.h>
3755///
3756/// This intrinsic does not correspond to a specific instruction.
3757///
3758/// \param __q0
3759/// A 64-bit integral value used to initialize the lower 64 bits of the
3760/// result.
3761/// \param __q1
3762/// A 64-bit integral value used to initialize the upper 64 bits of the
3763/// result.
3764/// \returns An initialized 128-bit integer vector.
3765static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3766_mm_setr_epi64(__m64 __q0, __m64 __q1) {
3767 return _mm_set_epi64(__q1, __q0);
3768}
3769
3770/// Constructs a 128-bit integer vector, initialized in reverse order
3771/// with the specified 32-bit integral values.
3772///
3773/// \headerfile <x86intrin.h>
3774///
3775/// This intrinsic is a utility function and does not correspond to a specific
3776/// instruction.
3777///
3778/// \param __i0
3779/// A 32-bit integral value used to initialize bits [31:0] of the result.
3780/// \param __i1
3781/// A 32-bit integral value used to initialize bits [63:32] of the result.
3782/// \param __i2
3783/// A 32-bit integral value used to initialize bits [95:64] of the result.
3784/// \param __i3
3785/// A 32-bit integral value used to initialize bits [127:96] of the result.
3786/// \returns An initialized 128-bit integer vector.
3787static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3788_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) {
3789 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3790}
3791
3792/// Constructs a 128-bit integer vector, initialized in reverse order
3793/// with the specified 16-bit integral values.
3794///
3795/// \headerfile <x86intrin.h>
3796///
3797/// This intrinsic is a utility function and does not correspond to a specific
3798/// instruction.
3799///
3800/// \param __w0
3801/// A 16-bit integral value used to initialize bits [15:0] of the result.
3802/// \param __w1
3803/// A 16-bit integral value used to initialize bits [31:16] of the result.
3804/// \param __w2
3805/// A 16-bit integral value used to initialize bits [47:32] of the result.
3806/// \param __w3
3807/// A 16-bit integral value used to initialize bits [63:48] of the result.
3808/// \param __w4
3809/// A 16-bit integral value used to initialize bits [79:64] of the result.
3810/// \param __w5
3811/// A 16-bit integral value used to initialize bits [95:80] of the result.
3812/// \param __w6
3813/// A 16-bit integral value used to initialize bits [111:96] of the result.
3814/// \param __w7
3815/// A 16-bit integral value used to initialize bits [127:112] of the result.
3816/// \returns An initialized 128-bit integer vector.
3817static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3818_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3819 short __w5, short __w6, short __w7) {
3820 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3821}
3822
3823/// Constructs a 128-bit integer vector, initialized in reverse order
3824/// with the specified 8-bit integral values.
3825///
3826/// \headerfile <x86intrin.h>
3827///
3828/// This intrinsic is a utility function and does not correspond to a specific
3829/// instruction.
3830///
3831/// \param __b0
3832/// An 8-bit integral value used to initialize bits [7:0] of the result.
3833/// \param __b1
3834/// An 8-bit integral value used to initialize bits [15:8] of the result.
3835/// \param __b2
3836/// An 8-bit integral value used to initialize bits [23:16] of the result.
3837/// \param __b3
3838/// An 8-bit integral value used to initialize bits [31:24] of the result.
3839/// \param __b4
3840/// An 8-bit integral value used to initialize bits [39:32] of the result.
3841/// \param __b5
3842/// An 8-bit integral value used to initialize bits [47:40] of the result.
3843/// \param __b6
3844/// An 8-bit integral value used to initialize bits [55:48] of the result.
3845/// \param __b7
3846/// An 8-bit integral value used to initialize bits [63:56] of the result.
3847/// \param __b8
3848/// An 8-bit integral value used to initialize bits [71:64] of the result.
3849/// \param __b9
3850/// An 8-bit integral value used to initialize bits [79:72] of the result.
3851/// \param __b10
3852/// An 8-bit integral value used to initialize bits [87:80] of the result.
3853/// \param __b11
3854/// An 8-bit integral value used to initialize bits [95:88] of the result.
3855/// \param __b12
3856/// An 8-bit integral value used to initialize bits [103:96] of the result.
3857/// \param __b13
3858/// An 8-bit integral value used to initialize bits [111:104] of the result.
3859/// \param __b14
3860/// An 8-bit integral value used to initialize bits [119:112] of the result.
3861/// \param __b15
3862/// An 8-bit integral value used to initialize bits [127:120] of the result.
3863/// \returns An initialized 128-bit integer vector.
3864static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3865_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3866 char __b6, char __b7, char __b8, char __b9, char __b10,
3867 char __b11, char __b12, char __b13, char __b14, char __b15) {
3868 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3869 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3870}
3871
3872/// Creates a 128-bit integer vector initialized to zero.
3873///
3874/// \headerfile <x86intrin.h>
3875///
3876/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3877///
3878/// \returns An initialized 128-bit integer vector with all elements set to
3879/// zero.
3880static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void) {
3881 return __extension__(__m128i)(__v2di){0LL, 0LL};
3882}
3883
3884/// Stores a 128-bit integer vector to a memory location aligned on a
3885/// 128-bit boundary.
3886///
3887/// \headerfile <x86intrin.h>
3888///
3889/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3890///
3891/// \param __p
3892/// A pointer to an aligned memory location that will receive the integer
3893/// values.
3894/// \param __b
3895/// A 128-bit integer vector containing the values to be moved.
3896static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3897 __m128i __b) {
3898 *__p = __b;
3899}
3900
3901/// Stores a 128-bit integer vector to an unaligned memory location.
3902///
3903/// \headerfile <x86intrin.h>
3904///
3905/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3906///
3907/// \param __p
3908/// A pointer to a memory location that will receive the integer values.
3909/// \param __b
3910/// A 128-bit integer vector containing the values to be moved.
3911static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3912 __m128i __b) {
3913 struct __storeu_si128 {
3914 __m128i_u __v;
3915 } __attribute__((__packed__, __may_alias__));
3916 ((struct __storeu_si128 *)__p)->__v = __b;
3917}
3918
3919/// Stores a 64-bit integer value from the low element of a 128-bit integer
3920/// vector.
3921///
3922/// \headerfile <x86intrin.h>
3923///
3924/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3925///
3926/// \param __p
3927/// A pointer to a 64-bit memory location. The address of the memory
3928/// location does not have to be aligned.
3929/// \param __b
3930/// A 128-bit integer vector containing the value to be stored.
3931static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3932 __m128i __b) {
3933 struct __storeu_si64 {
3934 long long __v;
3935 } __attribute__((__packed__, __may_alias__));
3936 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3937}
3938
3939/// Stores a 32-bit integer value from the low element of a 128-bit integer
3940/// vector.
3941///
3942/// \headerfile <x86intrin.h>
3943///
3944/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3945///
3946/// \param __p
3947/// A pointer to a 32-bit memory location. The address of the memory
3948/// location does not have to be aligned.
3949/// \param __b
3950/// A 128-bit integer vector containing the value to be stored.
3951static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3952 __m128i __b) {
3953 struct __storeu_si32 {
3954 int __v;
3955 } __attribute__((__packed__, __may_alias__));
3956 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3957}
3958
3959/// Stores a 16-bit integer value from the low element of a 128-bit integer
3960/// vector.
3961///
3962/// \headerfile <x86intrin.h>
3963///
3964/// This intrinsic does not correspond to a specific instruction.
3965///
3966/// \param __p
3967/// A pointer to a 16-bit memory location. The address of the memory
3968/// location does not have to be aligned.
3969/// \param __b
3970/// A 128-bit integer vector containing the value to be stored.
3971static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3972 __m128i __b) {
3973 struct __storeu_si16 {
3974 short __v;
3975 } __attribute__((__packed__, __may_alias__));
3976 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3977}
3978
3979/// Moves bytes selected by the mask from the first operand to the
3980/// specified unaligned memory location. When a mask bit is 1, the
3981/// corresponding byte is written, otherwise it is not written.
3982///
3983/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3984/// used again soon). Exception and trap behavior for elements not selected
3985/// for storage to memory are implementation dependent.
3986///
3987/// \headerfile <x86intrin.h>
3988///
3989/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3990/// instruction.
3991///
3992/// \param __d
3993/// A 128-bit integer vector containing the values to be moved.
3994/// \param __n
3995/// A 128-bit integer vector containing the mask. The most significant bit of
3996/// each byte represents the mask bits.
3997/// \param __p
3998/// A pointer to an unaligned 128-bit memory location where the specified
3999/// values are moved.
4000static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
4001 __m128i __n,
4002 char *__p) {
4003 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4004}
4005
4006/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4007/// a memory location.
4008///
4009/// \headerfile <x86intrin.h>
4010///
4011/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4012///
4013/// \param __p
4014/// A pointer to a 64-bit memory location that will receive the lower 64 bits
4015/// of the integer vector parameter.
4016/// \param __a
4017/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4018/// value to be stored.
4019static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
4020 __m128i __a) {
4021 struct __mm_storel_epi64_struct {
4022 long long __u;
4023 } __attribute__((__packed__, __may_alias__));
4024 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4025}
4026
4027/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4028/// aligned memory location.
4029///
4030/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4031/// used again soon).
4032///
4033/// \headerfile <x86intrin.h>
4034///
4035/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4036///
4037/// \param __p
4038/// A pointer to the 128-bit aligned memory location used to store the value.
4039/// \param __a
4040/// A vector of [2 x double] containing the 64-bit values to be stored.
4041static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4042 __m128d __a) {
4043 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4044}
4045
4046/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4047///
4048/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4049/// used again soon).
4050///
4051/// \headerfile <x86intrin.h>
4052///
4053/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4054///
4055/// \param __p
4056/// A pointer to the 128-bit aligned memory location used to store the value.
4057/// \param __a
4058/// A 128-bit integer vector containing the values to be stored.
4059static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4060 __m128i __a) {
4061 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4062}
4063
4064/// Stores a 32-bit integer value in the specified memory location.
4065///
4066/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4067/// used again soon).
4068///
4069/// \headerfile <x86intrin.h>
4070///
4071/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4072///
4073/// \param __p
4074/// A pointer to the 32-bit memory location used to store the value.
4075/// \param __a
4076/// A 32-bit integer containing the value to be stored.
4077static __inline__ void
4078 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4079 _mm_stream_si32(void *__p, int __a) {
4080 __builtin_ia32_movnti((int *)__p, __a);
4081}
4082
4083#ifdef __x86_64__
4084/// Stores a 64-bit integer value in the specified memory location.
4085///
4086/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4087/// used again soon).
4088///
4089/// \headerfile <x86intrin.h>
4090///
4091/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4092///
4093/// \param __p
4094/// A pointer to the 64-bit memory location used to store the value.
4095/// \param __a
4096/// A 64-bit integer containing the value to be stored.
4097static __inline__ void
4098 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4099 _mm_stream_si64(void *__p, long long __a) {
4100 __builtin_ia32_movnti64((long long *)__p, __a);
4101}
4102#endif
4103
4104#if defined(__cplusplus)
4105extern "C" {
4106#endif
4107
4108/// The cache line containing \a __p is flushed and invalidated from all
4109/// caches in the coherency domain.
4110///
4111/// \headerfile <x86intrin.h>
4112///
4113/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4114///
4115/// \param __p
4116/// A pointer to the memory location used to identify the cache line to be
4117/// flushed.
4118void _mm_clflush(void const *__p);
4119
4120/// Forces strong memory ordering (serialization) between load
4121/// instructions preceding this instruction and load instructions following
4122/// this instruction, ensuring the system completes all previous loads before
4123/// executing subsequent loads.
4124///
4125/// \headerfile <x86intrin.h>
4126///
4127/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4128///
4129void _mm_lfence(void);
4130
4131/// Forces strong memory ordering (serialization) between load and store
4132/// instructions preceding this instruction and load and store instructions
4133/// following this instruction, ensuring that the system completes all
4134/// previous memory accesses before executing subsequent memory accesses.
4135///
4136/// \headerfile <x86intrin.h>
4137///
4138/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4139///
4140void _mm_mfence(void);
4141
4142#if defined(__cplusplus)
4143} // extern "C"
4144#endif
4145
4146/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4147/// vector operands into 8-bit signed integers, and packs the results into
4148/// the destination.
4149///
4150/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
4151/// less than 0x80 are saturated to 0x80.
4152///
4153/// \headerfile <x86intrin.h>
4154///
4155/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4156///
4157/// \param __a
4158/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4159/// written to the lower 64 bits of the result.
4160/// \param __b
4161/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4162/// written to the higher 64 bits of the result.
4163/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4164static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4165 __m128i __b) {
4166 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4167}
4168
4169/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4170/// vector operands into 16-bit signed integers, and packs the results into
4171/// the destination.
4172///
4173/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4174/// values less than 0x8000 are saturated to 0x8000.
4175///
4176/// \headerfile <x86intrin.h>
4177///
4178/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4179///
4180/// \param __a
4181/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4182/// are written to the lower 64 bits of the result.
4183/// \param __b
4184/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4185/// are written to the higher 64 bits of the result.
4186/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4187static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4188 __m128i __b) {
4189 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4190}
4191
4192/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4193/// vector operands into 8-bit unsigned integers, and packs the results into
4194/// the destination.
4195///
4196/// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4197/// are saturated to 0x00.
4198///
4199/// \headerfile <x86intrin.h>
4200///
4201/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4202///
4203/// \param __a
4204/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4205/// written to the lower 64 bits of the result.
4206/// \param __b
4207/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4208/// written to the higher 64 bits of the result.
4209/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4210static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4211 __m128i __b) {
4212 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4213}
4214
4215/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4216/// the immediate-value parameter as a selector.
4217///
4218/// \headerfile <x86intrin.h>
4219///
4220/// \code
4221/// __m128i _mm_extract_epi16(__m128i a, const int imm);
4222/// \endcode
4223///
4224/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4225///
4226/// \param a
4227/// A 128-bit integer vector.
4228/// \param imm
4229/// An immediate value. Bits [2:0] selects values from \a a to be assigned
4230/// to bits[15:0] of the result. \n
4231/// 000: assign values from bits [15:0] of \a a. \n
4232/// 001: assign values from bits [31:16] of \a a. \n
4233/// 010: assign values from bits [47:32] of \a a. \n
4234/// 011: assign values from bits [63:48] of \a a. \n
4235/// 100: assign values from bits [79:64] of \a a. \n
4236/// 101: assign values from bits [95:80] of \a a. \n
4237/// 110: assign values from bits [111:96] of \a a. \n
4238/// 111: assign values from bits [127:112] of \a a.
4239/// \returns An integer, whose lower 16 bits are selected from the 128-bit
4240/// integer vector parameter and the remaining bits are assigned zeros.
4241#define _mm_extract_epi16(a, imm) \
4242 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4243 (int)(imm)))
4244
4245/// Constructs a 128-bit integer vector by first making a copy of the
4246/// 128-bit integer vector parameter, and then inserting the lower 16 bits
4247/// of an integer parameter into an offset specified by the immediate-value
4248/// parameter.
4249///
4250/// \headerfile <x86intrin.h>
4251///
4252/// \code
4253/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4254/// \endcode
4255///
4256/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4257///
4258/// \param a
4259/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4260/// result and then one of the eight elements in the result is replaced by
4261/// the lower 16 bits of \a b.
4262/// \param b
4263/// An integer. The lower 16 bits of this parameter are written to the
4264/// result beginning at an offset specified by \a imm.
4265/// \param imm
4266/// An immediate value specifying the bit offset in the result at which the
4267/// lower 16 bits of \a b are written.
4268/// \returns A 128-bit integer vector containing the constructed values.
4269#define _mm_insert_epi16(a, b, imm) \
4270 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4271 (int)(imm)))
4272
4273/// Copies the values of the most significant bits from each 8-bit
4274/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4275/// value, zero-extends the value, and writes it to the destination.
4276///
4277/// \headerfile <x86intrin.h>
4278///
4279/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4280///
4281/// \param __a
4282/// A 128-bit integer vector containing the values with bits to be extracted.
4283/// \returns The most significant bits from each 8-bit element in \a __a,
4284/// written to bits [15:0]. The other bits are assigned zeros.
4285static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4286 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4287}
4288
4289/// Constructs a 128-bit integer vector by shuffling four 32-bit
4290/// elements of a 128-bit integer vector parameter, using the immediate-value
4291/// parameter as a specifier.
4292///
4293/// \headerfile <x86intrin.h>
4294///
4295/// \code
4296/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4297/// \endcode
4298///
4299/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4300///
4301/// \param a
4302/// A 128-bit integer vector containing the values to be copied.
4303/// \param imm
4304/// An immediate value containing an 8-bit value specifying which elements to
4305/// copy from a. The destinations within the 128-bit destination are assigned
4306/// values as follows: \n
4307/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4308/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4309/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4310/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4311/// Bit value assignments: \n
4312/// 00: assign values from bits [31:0] of \a a. \n
4313/// 01: assign values from bits [63:32] of \a a. \n
4314/// 10: assign values from bits [95:64] of \a a. \n
4315/// 11: assign values from bits [127:96] of \a a. \n
4316/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4317/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4318/// <c>[b6, b4, b2, b0]</c>.
4319/// \returns A 128-bit integer vector containing the shuffled values.
4320#define _mm_shuffle_epi32(a, imm) \
4321 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4322
4323/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4324/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4325/// value parameter as a specifier.
4326///
4327/// \headerfile <x86intrin.h>
4328///
4329/// \code
4330/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4331/// \endcode
4332///
4333/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4334///
4335/// \param a
4336/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4337/// [127:64] of the result.
4338/// \param imm
4339/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4340/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4341/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4342/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4343/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4344/// Bit value assignments: \n
4345/// 00: assign values from bits [15:0] of \a a. \n
4346/// 01: assign values from bits [31:16] of \a a. \n
4347/// 10: assign values from bits [47:32] of \a a. \n
4348/// 11: assign values from bits [63:48] of \a a. \n
4349/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4350/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4351/// <c>[b6, b4, b2, b0]</c>.
4352/// \returns A 128-bit integer vector containing the shuffled values.
4353#define _mm_shufflelo_epi16(a, imm) \
4354 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4355
4356/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4357/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4358/// value parameter as a specifier.
4359///
4360/// \headerfile <x86intrin.h>
4361///
4362/// \code
4363/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4364/// \endcode
4365///
4366/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4367///
4368/// \param a
4369/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4370/// [63:0] of the result.
4371/// \param imm
4372/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4373/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4374/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4375/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4376/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4377/// Bit value assignments: \n
4378/// 00: assign values from bits [79:64] of \a a. \n
4379/// 01: assign values from bits [95:80] of \a a. \n
4380/// 10: assign values from bits [111:96] of \a a. \n
4381/// 11: assign values from bits [127:112] of \a a. \n
4382/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4383/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4384/// <c>[b6, b4, b2, b0]</c>.
4385/// \returns A 128-bit integer vector containing the shuffled values.
4386#define _mm_shufflehi_epi16(a, imm) \
4387 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4388
4389/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4390/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4391///
4392/// \headerfile <x86intrin.h>
4393///
4394/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4395/// instruction.
4396///
4397/// \param __a
4398/// A 128-bit vector of [16 x i8].
4399/// Bits [71:64] are written to bits [7:0] of the result. \n
4400/// Bits [79:72] are written to bits [23:16] of the result. \n
4401/// Bits [87:80] are written to bits [39:32] of the result. \n
4402/// Bits [95:88] are written to bits [55:48] of the result. \n
4403/// Bits [103:96] are written to bits [71:64] of the result. \n
4404/// Bits [111:104] are written to bits [87:80] of the result. \n
4405/// Bits [119:112] are written to bits [103:96] of the result. \n
4406/// Bits [127:120] are written to bits [119:112] of the result.
4407/// \param __b
4408/// A 128-bit vector of [16 x i8]. \n
4409/// Bits [71:64] are written to bits [15:8] of the result. \n
4410/// Bits [79:72] are written to bits [31:24] of the result. \n
4411/// Bits [87:80] are written to bits [47:40] of the result. \n
4412/// Bits [95:88] are written to bits [63:56] of the result. \n
4413/// Bits [103:96] are written to bits [79:72] of the result. \n
4414/// Bits [111:104] are written to bits [95:88] of the result. \n
4415/// Bits [119:112] are written to bits [111:104] of the result. \n
4416/// Bits [127:120] are written to bits [127:120] of the result.
4417/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4418static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4419 __m128i __b) {
4420 return (__m128i)__builtin_shufflevector(
4421 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4422 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4423}
4424
4425/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4426/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4427///
4428/// \headerfile <x86intrin.h>
4429///
4430/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4431/// instruction.
4432///
4433/// \param __a
4434/// A 128-bit vector of [8 x i16].
4435/// Bits [79:64] are written to bits [15:0] of the result. \n
4436/// Bits [95:80] are written to bits [47:32] of the result. \n
4437/// Bits [111:96] are written to bits [79:64] of the result. \n
4438/// Bits [127:112] are written to bits [111:96] of the result.
4439/// \param __b
4440/// A 128-bit vector of [8 x i16].
4441/// Bits [79:64] are written to bits [31:16] of the result. \n
4442/// Bits [95:80] are written to bits [63:48] of the result. \n
4443/// Bits [111:96] are written to bits [95:80] of the result. \n
4444/// Bits [127:112] are written to bits [127:112] of the result.
4445/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4446static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4447 __m128i __b) {
4448 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4449 8 + 5, 6, 8 + 6, 7, 8 + 7);
4450}
4451
4452/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4453/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4454///
4455/// \headerfile <x86intrin.h>
4456///
4457/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4458/// instruction.
4459///
4460/// \param __a
4461/// A 128-bit vector of [4 x i32]. \n
4462/// Bits [95:64] are written to bits [31:0] of the destination. \n
4463/// Bits [127:96] are written to bits [95:64] of the destination.
4464/// \param __b
4465/// A 128-bit vector of [4 x i32]. \n
4466/// Bits [95:64] are written to bits [64:32] of the destination. \n
4467/// Bits [127:96] are written to bits [127:96] of the destination.
4468/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4469static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4470 __m128i __b) {
4471 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4472 4 + 3);
4473}
4474
4475/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4476/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4477///
4478/// \headerfile <x86intrin.h>
4479///
4480/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4481/// instruction.
4482///
4483/// \param __a
4484/// A 128-bit vector of [2 x i64]. \n
4485/// Bits [127:64] are written to bits [63:0] of the destination.
4486/// \param __b
4487/// A 128-bit vector of [2 x i64]. \n
4488/// Bits [127:64] are written to bits [127:64] of the destination.
4489/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4490static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4491 __m128i __b) {
4492 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4493}
4494
4495/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4496/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4497///
4498/// \headerfile <x86intrin.h>
4499///
4500/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4501/// instruction.
4502///
4503/// \param __a
4504/// A 128-bit vector of [16 x i8]. \n
4505/// Bits [7:0] are written to bits [7:0] of the result. \n
4506/// Bits [15:8] are written to bits [23:16] of the result. \n
4507/// Bits [23:16] are written to bits [39:32] of the result. \n
4508/// Bits [31:24] are written to bits [55:48] of the result. \n
4509/// Bits [39:32] are written to bits [71:64] of the result. \n
4510/// Bits [47:40] are written to bits [87:80] of the result. \n
4511/// Bits [55:48] are written to bits [103:96] of the result. \n
4512/// Bits [63:56] are written to bits [119:112] of the result.
4513/// \param __b
4514/// A 128-bit vector of [16 x i8].
4515/// Bits [7:0] are written to bits [15:8] of the result. \n
4516/// Bits [15:8] are written to bits [31:24] of the result. \n
4517/// Bits [23:16] are written to bits [47:40] of the result. \n
4518/// Bits [31:24] are written to bits [63:56] of the result. \n
4519/// Bits [39:32] are written to bits [79:72] of the result. \n
4520/// Bits [47:40] are written to bits [95:88] of the result. \n
4521/// Bits [55:48] are written to bits [111:104] of the result. \n
4522/// Bits [63:56] are written to bits [127:120] of the result.
4523/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4524static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4525 __m128i __b) {
4526 return (__m128i)__builtin_shufflevector(
4527 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4528 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4529}
4530
4531/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4532/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4533/// [8 x i16].
4534///
4535/// \headerfile <x86intrin.h>
4536///
4537/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4538/// instruction.
4539///
4540/// \param __a
4541/// A 128-bit vector of [8 x i16].
4542/// Bits [15:0] are written to bits [15:0] of the result. \n
4543/// Bits [31:16] are written to bits [47:32] of the result. \n
4544/// Bits [47:32] are written to bits [79:64] of the result. \n
4545/// Bits [63:48] are written to bits [111:96] of the result.
4546/// \param __b
4547/// A 128-bit vector of [8 x i16].
4548/// Bits [15:0] are written to bits [31:16] of the result. \n
4549/// Bits [31:16] are written to bits [63:48] of the result. \n
4550/// Bits [47:32] are written to bits [95:80] of the result. \n
4551/// Bits [63:48] are written to bits [127:112] of the result.
4552/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4553static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4554 __m128i __b) {
4555 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4556 8 + 1, 2, 8 + 2, 3, 8 + 3);
4557}
4558
4559/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4560/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4561///
4562/// \headerfile <x86intrin.h>
4563///
4564/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4565/// instruction.
4566///
4567/// \param __a
4568/// A 128-bit vector of [4 x i32]. \n
4569/// Bits [31:0] are written to bits [31:0] of the destination. \n
4570/// Bits [63:32] are written to bits [95:64] of the destination.
4571/// \param __b
4572/// A 128-bit vector of [4 x i32]. \n
4573/// Bits [31:0] are written to bits [64:32] of the destination. \n
4574/// Bits [63:32] are written to bits [127:96] of the destination.
4575/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4576static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4577 __m128i __b) {
4578 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4579 4 + 1);
4580}
4581
4582/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4583/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4584///
4585/// \headerfile <x86intrin.h>
4586///
4587/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4588/// instruction.
4589///
4590/// \param __a
4591/// A 128-bit vector of [2 x i64]. \n
4592/// Bits [63:0] are written to bits [63:0] of the destination. \n
4593/// \param __b
4594/// A 128-bit vector of [2 x i64]. \n
4595/// Bits [63:0] are written to bits [127:64] of the destination. \n
4596/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4597static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4598 __m128i __b) {
4599 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4600}
4601
4602/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4603/// integer.
4604///
4605/// \headerfile <x86intrin.h>
4606///
4607/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4608///
4609/// \param __a
4610/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4611/// destination.
4612/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4613static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
4615 return (__m64)__a[0];
4616}
4617
4618/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4619/// upper bits.
4620///
4621/// \headerfile <x86intrin.h>
4622///
4623/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4624///
4625/// \param __a
4626/// A 64-bit value.
4627/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4628/// the operand. The upper 64 bits are assigned zeros.
4629static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4631 return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1);
4632}
4633
4634/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4635/// integer vector, zeroing the upper bits.
4636///
4637/// \headerfile <x86intrin.h>
4638///
4639/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4640///
4641/// \param __a
4642/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4643/// destination.
4644/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4645/// the operand. The upper 64 bits are assigned zeros.
4646static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4648 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4649}
4650
4651/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4652/// [2 x double] and interleaves them into a 128-bit vector of [2 x
4653/// double].
4654///
4655/// \headerfile <x86intrin.h>
4656///
4657/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4658///
4659/// \param __a
4660/// A 128-bit vector of [2 x double]. \n
4661/// Bits [127:64] are written to bits [63:0] of the destination.
4662/// \param __b
4663/// A 128-bit vector of [2 x double]. \n
4664/// Bits [127:64] are written to bits [127:64] of the destination.
4665/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4666static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4667_mm_unpackhi_pd(__m128d __a, __m128d __b) {
4668 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4669}
4670
4671/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4672/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4673/// double].
4674///
4675/// \headerfile <x86intrin.h>
4676///
4677/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4678///
4679/// \param __a
4680/// A 128-bit vector of [2 x double]. \n
4681/// Bits [63:0] are written to bits [63:0] of the destination.
4682/// \param __b
4683/// A 128-bit vector of [2 x double]. \n
4684/// Bits [63:0] are written to bits [127:64] of the destination.
4685/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4686static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4687_mm_unpacklo_pd(__m128d __a, __m128d __b) {
4688 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4689}
4690
4691/// Extracts the sign bits of the double-precision values in the 128-bit
4692/// vector of [2 x double], zero-extends the value, and writes it to the
4693/// low-order bits of the destination.
4694///
4695/// \headerfile <x86intrin.h>
4696///
4697/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4698///
4699/// \param __a
4700/// A 128-bit vector of [2 x double] containing the values with sign bits to
4701/// be extracted.
4702/// \returns The sign bits from each of the double-precision elements in \a __a,
4703/// written to bits [1:0]. The remaining bits are assigned values of zero.
4704static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4705 return __builtin_ia32_movmskpd((__v2df)__a);
4706}
4707
4708/// Constructs a 128-bit floating-point vector of [2 x double] from two
4709/// 128-bit vector parameters of [2 x double], using the immediate-value
4710/// parameter as a specifier.
4711///
4712/// \headerfile <x86intrin.h>
4713///
4714/// \code
4715/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4716/// \endcode
4717///
4718/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4719///
4720/// \param a
4721/// A 128-bit vector of [2 x double].
4722/// \param b
4723/// A 128-bit vector of [2 x double].
4724/// \param i
4725/// An 8-bit immediate value. The least significant two bits specify which
4726/// elements to copy from \a a and \a b: \n
4727/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4728/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4729/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4730/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4731/// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4732/// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4733/// <c>[b1, b0]</c>.
4734/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
4735#define _mm_shuffle_pd(a, b, i) \
4736 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4737 (int)(i)))
4738
4739/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4740/// floating-point vector of [4 x float].
4741///
4742/// \headerfile <x86intrin.h>
4743///
4744/// This intrinsic has no corresponding instruction.
4745///
4746/// \param __a
4747/// A 128-bit floating-point vector of [2 x double].
4748/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4749/// bitwise pattern as the parameter.
4750static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4752 return (__m128)__a;
4753}
4754
4755/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4756/// integer vector.
4757///
4758/// \headerfile <x86intrin.h>
4759///
4760/// This intrinsic has no corresponding instruction.
4761///
4762/// \param __a
4763/// A 128-bit floating-point vector of [2 x double].
4764/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4765/// parameter.
4766static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4768 return (__m128i)__a;
4769}
4770
4771/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4772/// floating-point vector of [2 x double].
4773///
4774/// \headerfile <x86intrin.h>
4775///
4776/// This intrinsic has no corresponding instruction.
4777///
4778/// \param __a
4779/// A 128-bit floating-point vector of [4 x float].
4780/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4781/// bitwise pattern as the parameter.
4782static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4784 return (__m128d)__a;
4785}
4786
4787/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4788/// integer vector.
4789///
4790/// \headerfile <x86intrin.h>
4791///
4792/// This intrinsic has no corresponding instruction.
4793///
4794/// \param __a
4795/// A 128-bit floating-point vector of [4 x float].
4796/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4797/// parameter.
4798static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4800 return (__m128i)__a;
4801}
4802
4803/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4804/// of [4 x float].
4805///
4806/// \headerfile <x86intrin.h>
4807///
4808/// This intrinsic has no corresponding instruction.
4809///
4810/// \param __a
4811/// A 128-bit integer vector.
4812/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4813/// bitwise pattern as the parameter.
4814static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4816 return (__m128)__a;
4817}
4818
4819/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4820/// of [2 x double].
4821///
4822/// \headerfile <x86intrin.h>
4823///
4824/// This intrinsic has no corresponding instruction.
4825///
4826/// \param __a
4827/// A 128-bit integer vector.
4828/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4829/// bitwise pattern as the parameter.
4830static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4832 return (__m128d)__a;
4833}
4834
4835/// Compares each of the corresponding double-precision values of two
4836/// 128-bit vectors of [2 x double], using the operation specified by the
4837/// immediate integer operand.
4838///
4839/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4840/// If either value in a comparison is NaN, comparisons that are ordered
4841/// return false, and comparisons that are unordered return true.
4842///
4843/// \headerfile <x86intrin.h>
4844///
4845/// \code
4846/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
4847/// \endcode
4848///
4849/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
4850///
4851/// \param a
4852/// A 128-bit vector of [2 x double].
4853/// \param b
4854/// A 128-bit vector of [2 x double].
4855/// \param c
4856/// An immediate integer operand, with bits [4:0] specifying which comparison
4857/// operation to use: \n
4858/// 0x00: Equal (ordered, non-signaling) \n
4859/// 0x01: Less-than (ordered, signaling) \n
4860/// 0x02: Less-than-or-equal (ordered, signaling) \n
4861/// 0x03: Unordered (non-signaling) \n
4862/// 0x04: Not-equal (unordered, non-signaling) \n
4863/// 0x05: Not-less-than (unordered, signaling) \n
4864/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4865/// 0x07: Ordered (non-signaling) \n
4866/// \returns A 128-bit vector of [2 x double] containing the comparison results.
4867#define _mm_cmp_pd(a, b, c) \
4868 ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4869 (c)))
4870
4871/// Compares each of the corresponding scalar double-precision values of
4872/// two 128-bit vectors of [2 x double], using the operation specified by the
4873/// immediate integer operand.
4874///
4875/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4876/// If either value in a comparison is NaN, comparisons that are ordered
4877/// return false, and comparisons that are unordered return true.
4878///
4879/// \headerfile <x86intrin.h>
4880///
4881/// \code
4882/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
4883/// \endcode
4884///
4885/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
4886///
4887/// \param a
4888/// A 128-bit vector of [2 x double].
4889/// \param b
4890/// A 128-bit vector of [2 x double].
4891/// \param c
4892/// An immediate integer operand, with bits [4:0] specifying which comparison
4893/// operation to use: \n
4894/// 0x00: Equal (ordered, non-signaling) \n
4895/// 0x01: Less-than (ordered, signaling) \n
4896/// 0x02: Less-than-or-equal (ordered, signaling) \n
4897/// 0x03: Unordered (non-signaling) \n
4898/// 0x04: Not-equal (unordered, non-signaling) \n
4899/// 0x05: Not-less-than (unordered, signaling) \n
4900/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4901/// 0x07: Ordered (non-signaling) \n
4902/// \returns A 128-bit vector of [2 x double] containing the comparison results.
4903#define _mm_cmp_sd(a, b, c) \
4904 ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4905 (c)))
4906
4907#if defined(__cplusplus)
4908extern "C" {
4909#endif
4910
4911/// Indicates that a spin loop is being executed for the purposes of
4912/// optimizing power consumption during the loop.
4913///
4914/// \headerfile <x86intrin.h>
4915///
4916/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4917///
4918void _mm_pause(void);
4919
4920#if defined(__cplusplus)
4921} // extern "C"
4922#endif
4923
4924#undef __anyext128
4925#undef __trunc64
4926#undef __DEFAULT_FN_ATTRS
4927#undef __DEFAULT_FN_ATTRS_CONSTEXPR
4928
4929#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4930
4931#define _MM_DENORMALS_ZERO_ON (0x0040U)
4932#define _MM_DENORMALS_ZERO_OFF (0x0000U)
4933
4934#define _MM_DENORMALS_ZERO_MASK (0x0040U)
4935
4936#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4937#define _MM_SET_DENORMALS_ZERO_MODE(x) \
4938 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4939
4940#endif /* __EMMINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1557
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1059
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4553
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1972
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4767
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1035
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4614
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:227
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1862
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4210
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:129
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2377
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:600
#define __anyext128(x)
Definition: emmintrin.h:70
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1665
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4059
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4285
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2828
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2679
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:835
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:208
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1201
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1628
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3428
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:4000
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1177
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1225
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2172
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1792
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1824
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1572
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3092
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3019
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3232
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2525
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:757
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:4019
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:89
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3252
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2714
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:534
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3971
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:430
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:313
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1703
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1684
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2452
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:782
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2698
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3150
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3037
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3729
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3111
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2434
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2543
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1153
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2883
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2339
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2276
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1952
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3001
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4524
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:413
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:808
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3172
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1332
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:988
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3537
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:57
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2598
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:732
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3880
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:684
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4597
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2921
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3516
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4751
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4704
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4783
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3130
static __inline__ void int __a
Definition: emmintrin.h:4079
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1842
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2662
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3951
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1506
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3766
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:492
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4490
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition: emmintrin.h:1487
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2577
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3397
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1444
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4576
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3192
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:268
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:186
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4446
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1354
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3495
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1249
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2301
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2238
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3382
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3676
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2560
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1646
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3055
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2774
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3865
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:621
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1396
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1373
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1754
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2216
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2194
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2620
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2508
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1774
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4469
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1291
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4647
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2396
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4799
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3694
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1107
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:579
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:376
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2415
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1011
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1896
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2320
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2864
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4164
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2902
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1612
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1083
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1876
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:663
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3563
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:168
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3605
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2028
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2810
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3818
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2491
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:962
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:862
#define __DEFAULT_FN_ATTRS_CONSTEXPR
Definition: emmintrin.h:65
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2047
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1933
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3458
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: emmintrin.h:4187
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2792
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2071
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:937
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:513
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3443
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:707
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4041
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3212
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2130
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2114
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2470
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:912
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3073
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1310
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:887
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers,...
Definition: emmintrin.h:3368
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1728
#define __trunc64(x)
Definition: emmintrin.h:68
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:251
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1419
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:292
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2846
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:107
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3931
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1912
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4630
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1588
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4667
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:359
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2358
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3896
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3788
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4687
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:450
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:338
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3478
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3412
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4815
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2257
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4831
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2151
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:2011
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:397
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4418
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1525
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3331
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1541
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2983
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1989
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3711
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1273
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3654
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2092
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1808
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the ...
Definition: emmintrin.h:2641
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1131
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:556
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:471
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3911
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2940
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:147
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1467
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3349
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3746
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2731
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:642
struct __storeu_i16 *__P __v
Definition: immintrin.h:472
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1342