clang 20.0.0git
emmintrin.h
Go to the documentation of this file.
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __EMMINTRIN_H
11#define __EMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <xmmintrin.h>
18
/* Public 128-bit vector types: each is 16 bytes wide (__vector_size__(16))
 * and declared with 16-byte alignment (__aligned__(16)). */
19typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21
/* Same element types and size as the two types above, but declared with
 * __aligned__(1). NOTE(review): presumably meant for unaligned memory
 * accesses; the users are outside this view. */
22typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23typedef long long __m128i_u
24 __attribute__((__vector_size__(16), __aligned__(1)));
25
26/* Type defines. */
/* 16-byte vectors of double, long long, short and char elements. No explicit
 * alignment attribute is given for these. */
27typedef double __v2df __attribute__((__vector_size__(16)));
28typedef long long __v2di __attribute__((__vector_size__(16)));
29typedef short __v8hi __attribute__((__vector_size__(16)));
30typedef char __v16qi __attribute__((__vector_size__(16)));
31
32/* Unsigned types */
/* Unsigned counterparts of __v2di, __v8hi and __v16qi. */
33typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36
37/* We need an explicitly signed variant for char. Note that this shouldn't
38 * appear in the interface though. */
39typedef signed char __v16qs __attribute__((__vector_size__(16)));
40
/* The _Float16 and __bf16 vector types below are only declared when
 * __SSE2__ is defined. */
41#ifdef __SSE2__
42/* Both _Float16 and __bf16 require SSE2 being enabled. */
43typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46
47typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49#endif
50
51/* Define the default attributes for the functions in this file. */
/* Both variants apply __always_inline__, __nodebug__, a __target__ string
 * containing "sse2", and __min_vector_width__(128). The first variant is
 * selected when __EVEX512__ is defined and __AVX10_1_512__ is not; it
 * additionally puts "no-evex512" into the target string. */
52#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
53#define __DEFAULT_FN_ATTRS \
54 __attribute__((__always_inline__, __nodebug__, \
55 __target__("sse2,no-evex512"), __min_vector_width__(128)))
/* Otherwise only "sse2" is requested. */
56#else
57#define __DEFAULT_FN_ATTRS \
58 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
59 __min_vector_width__(128)))
60#endif
61
62#define __trunc64(x) \
63 (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
64#define __anyext128(x) \
65 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
66 1, -1, -1)
67
68/// Adds lower double-precision values in both operands and returns the
69/// sum in the lower 64 bits of the result. The upper 64 bits of the result
70/// are copied from the upper double-precision value of the first operand.
71///
72/// \headerfile <x86intrin.h>
73///
74/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
75///
76/// \param __a
77/// A 128-bit vector of [2 x double] containing one of the source operands.
78/// \param __b
79/// A 128-bit vector of [2 x double] containing one of the source operands.
80/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
81/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
82/// from the upper 64 bits of the first source operand.
83static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
84 __m128d __b) {
85 __a[0] += __b[0];
86 return __a;
87}
88
89/// Adds two 128-bit vectors of [2 x double].
90///
91/// \headerfile <x86intrin.h>
92///
93/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
94///
95/// \param __a
96/// A 128-bit vector of [2 x double] containing one of the source operands.
97/// \param __b
98/// A 128-bit vector of [2 x double] containing one of the source operands.
99/// \returns A 128-bit vector of [2 x double] containing the sums of both
100/// operands.
101static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
102 __m128d __b) {
103 return (__m128d)((__v2df)__a + (__v2df)__b);
104}
105
106/// Subtracts the lower double-precision value of the second operand
107/// from the lower double-precision value of the first operand and returns
108/// the difference in the lower 64 bits of the result. The upper 64 bits of
109/// the result are copied from the upper double-precision value of the first
110/// operand.
111///
112/// \headerfile <x86intrin.h>
113///
114/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
115///
116/// \param __a
117/// A 128-bit vector of [2 x double] containing the minuend.
118/// \param __b
119/// A 128-bit vector of [2 x double] containing the subtrahend.
120/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
121/// difference of the lower 64 bits of both operands. The upper 64 bits are
122/// copied from the upper 64 bits of the first source operand.
123static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
124 __m128d __b) {
125 __a[0] -= __b[0];
126 return __a;
127}
128
129/// Subtracts two 128-bit vectors of [2 x double].
130///
131/// \headerfile <x86intrin.h>
132///
133/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
134///
135/// \param __a
136/// A 128-bit vector of [2 x double] containing the minuend.
137/// \param __b
138/// A 128-bit vector of [2 x double] containing the subtrahend.
139/// \returns A 128-bit vector of [2 x double] containing the differences between
140/// both operands.
141static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
142 __m128d __b) {
143 return (__m128d)((__v2df)__a - (__v2df)__b);
144}
145
146/// Multiplies lower double-precision values in both operands and returns
147/// the product in the lower 64 bits of the result. The upper 64 bits of the
148/// result are copied from the upper double-precision value of the first
149/// operand.
150///
151/// \headerfile <x86intrin.h>
152///
153/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
154///
155/// \param __a
156/// A 128-bit vector of [2 x double] containing one of the source operands.
157/// \param __b
158/// A 128-bit vector of [2 x double] containing one of the source operands.
159/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
160/// product of the lower 64 bits of both operands. The upper 64 bits are
161/// copied from the upper 64 bits of the first source operand.
162static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
163 __m128d __b) {
164 __a[0] *= __b[0];
165 return __a;
166}
167
168/// Multiplies two 128-bit vectors of [2 x double].
169///
170/// \headerfile <x86intrin.h>
171///
172/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
173///
174/// \param __a
175/// A 128-bit vector of [2 x double] containing one of the operands.
176/// \param __b
177/// A 128-bit vector of [2 x double] containing one of the operands.
178/// \returns A 128-bit vector of [2 x double] containing the products of both
179/// operands.
180static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
181 __m128d __b) {
182 return (__m128d)((__v2df)__a * (__v2df)__b);
183}
184
185/// Divides the lower double-precision value of the first operand by the
186/// lower double-precision value of the second operand and returns the
187/// quotient in the lower 64 bits of the result. The upper 64 bits of the
188/// result are copied from the upper double-precision value of the first
189/// operand.
190///
191/// \headerfile <x86intrin.h>
192///
193/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
194///
195/// \param __a
196/// A 128-bit vector of [2 x double] containing the dividend.
197/// \param __b
///    A 128-bit vector of [2 x double] containing the divisor.
199/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
200/// quotient of the lower 64 bits of both operands. The upper 64 bits are
201/// copied from the upper 64 bits of the first source operand.
202static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
203 __m128d __b) {
204 __a[0] /= __b[0];
205 return __a;
206}
207
208/// Performs an element-by-element division of two 128-bit vectors of
209/// [2 x double].
210///
211/// \headerfile <x86intrin.h>
212///
213/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
214///
215/// \param __a
216/// A 128-bit vector of [2 x double] containing the dividend.
217/// \param __b
218/// A 128-bit vector of [2 x double] containing the divisor.
219/// \returns A 128-bit vector of [2 x double] containing the quotients of both
220/// operands.
221static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
222 __m128d __b) {
223 return (__m128d)((__v2df)__a / (__v2df)__b);
224}
225
226/// Calculates the square root of the lower double-precision value of
227/// the second operand and returns it in the lower 64 bits of the result.
228/// The upper 64 bits of the result are copied from the upper
229/// double-precision value of the first operand.
230///
231/// \headerfile <x86intrin.h>
232///
233/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
234///
235/// \param __a
236/// A 128-bit vector of [2 x double] containing one of the operands. The
237/// upper 64 bits of this operand are copied to the upper 64 bits of the
238/// result.
239/// \param __b
240/// A 128-bit vector of [2 x double] containing one of the operands. The
241/// square root is calculated using the lower 64 bits of this operand.
242/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
243/// square root of the lower 64 bits of operand \a __b, and whose upper 64
244/// bits are copied from the upper 64 bits of operand \a __a.
245static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
246 __m128d __b) {
247 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
248 return __extension__(__m128d){__c[0], __a[1]};
249}
250
/// Calculates the square root of each of the two values stored in a
///    128-bit vector of [2 x double].
253///
254/// \headerfile <x86intrin.h>
255///
256/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
257///
258/// \param __a
259/// A 128-bit vector of [2 x double].
260/// \returns A 128-bit vector of [2 x double] containing the square roots of the
261/// values in the operand.
262static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
263 return __builtin_ia32_sqrtpd((__v2df)__a);
264}
265
266/// Compares lower 64-bit double-precision values of both operands, and
267/// returns the lesser of the pair of values in the lower 64-bits of the
268/// result. The upper 64 bits of the result are copied from the upper
269/// double-precision value of the first operand.
270///
271/// If either value in a comparison is NaN, returns the value from \a __b.
272///
273/// \headerfile <x86intrin.h>
274///
275/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
276///
277/// \param __a
278/// A 128-bit vector of [2 x double] containing one of the operands. The
279/// lower 64 bits of this operand are used in the comparison.
280/// \param __b
281/// A 128-bit vector of [2 x double] containing one of the operands. The
282/// lower 64 bits of this operand are used in the comparison.
283/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
284/// minimum value between both operands. The upper 64 bits are copied from
285/// the upper 64 bits of the first source operand.
286static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
287 __m128d __b) {
288 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
289}
290
291/// Performs element-by-element comparison of the two 128-bit vectors of
292/// [2 x double] and returns a vector containing the lesser of each pair of
293/// values.
294///
295/// If either value in a comparison is NaN, returns the value from \a __b.
296///
297/// \headerfile <x86intrin.h>
298///
299/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
300///
301/// \param __a
302/// A 128-bit vector of [2 x double] containing one of the operands.
303/// \param __b
304/// A 128-bit vector of [2 x double] containing one of the operands.
305/// \returns A 128-bit vector of [2 x double] containing the minimum values
306/// between both operands.
307static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
308 __m128d __b) {
309 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
310}
311
312/// Compares lower 64-bit double-precision values of both operands, and
313/// returns the greater of the pair of values in the lower 64-bits of the
314/// result. The upper 64 bits of the result are copied from the upper
315/// double-precision value of the first operand.
316///
317/// If either value in a comparison is NaN, returns the value from \a __b.
318///
319/// \headerfile <x86intrin.h>
320///
321/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
322///
323/// \param __a
324/// A 128-bit vector of [2 x double] containing one of the operands. The
325/// lower 64 bits of this operand are used in the comparison.
326/// \param __b
327/// A 128-bit vector of [2 x double] containing one of the operands. The
328/// lower 64 bits of this operand are used in the comparison.
329/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
330/// maximum value between both operands. The upper 64 bits are copied from
331/// the upper 64 bits of the first source operand.
332static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
333 __m128d __b) {
334 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
335}
336
337/// Performs element-by-element comparison of the two 128-bit vectors of
338/// [2 x double] and returns a vector containing the greater of each pair
339/// of values.
340///
341/// If either value in a comparison is NaN, returns the value from \a __b.
342///
343/// \headerfile <x86intrin.h>
344///
345/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
346///
347/// \param __a
348/// A 128-bit vector of [2 x double] containing one of the operands.
349/// \param __b
350/// A 128-bit vector of [2 x double] containing one of the operands.
351/// \returns A 128-bit vector of [2 x double] containing the maximum values
352/// between both operands.
353static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
354 __m128d __b) {
355 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
356}
357
358/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
359///
360/// \headerfile <x86intrin.h>
361///
362/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
363///
364/// \param __a
365/// A 128-bit vector of [2 x double] containing one of the source operands.
366/// \param __b
367/// A 128-bit vector of [2 x double] containing one of the source operands.
368/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
369/// values between both operands.
370static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
371 __m128d __b) {
372 return (__m128d)((__v2du)__a & (__v2du)__b);
373}
374
375/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
376/// the one's complement of the values contained in the first source operand.
377///
378/// \headerfile <x86intrin.h>
379///
380/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
381///
382/// \param __a
383/// A 128-bit vector of [2 x double] containing the left source operand. The
384/// one's complement of this value is used in the bitwise AND.
385/// \param __b
386/// A 128-bit vector of [2 x double] containing the right source operand.
387/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
388/// values in the second operand and the one's complement of the first
389/// operand.
390static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
391 __m128d __b) {
392 return (__m128d)(~(__v2du)__a & (__v2du)__b);
393}
394
395/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
396///
397/// \headerfile <x86intrin.h>
398///
399/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
400///
401/// \param __a
402/// A 128-bit vector of [2 x double] containing one of the source operands.
403/// \param __b
404/// A 128-bit vector of [2 x double] containing one of the source operands.
405/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
406/// values between both operands.
407static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
408 __m128d __b) {
409 return (__m128d)((__v2du)__a | (__v2du)__b);
410}
411
412/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
413///
414/// \headerfile <x86intrin.h>
415///
416/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
417///
418/// \param __a
419/// A 128-bit vector of [2 x double] containing one of the source operands.
420/// \param __b
421/// A 128-bit vector of [2 x double] containing one of the source operands.
422/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
423/// values between both operands.
424static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
425 __m128d __b) {
426 return (__m128d)((__v2du)__a ^ (__v2du)__b);
427}
428
429/// Compares each of the corresponding double-precision values of the
430/// 128-bit vectors of [2 x double] for equality.
431///
432/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
433/// If either value in a comparison is NaN, returns false.
434///
435/// \headerfile <x86intrin.h>
436///
437/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
438///
439/// \param __a
440/// A 128-bit vector of [2 x double].
441/// \param __b
442/// A 128-bit vector of [2 x double].
443/// \returns A 128-bit vector containing the comparison results.
444static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
445 __m128d __b) {
446 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
447}
448
449/// Compares each of the corresponding double-precision values of the
450/// 128-bit vectors of [2 x double] to determine if the values in the first
451/// operand are less than those in the second operand.
452///
453/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
454/// If either value in a comparison is NaN, returns false.
455///
456/// \headerfile <x86intrin.h>
457///
458/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
459///
460/// \param __a
461/// A 128-bit vector of [2 x double].
462/// \param __b
463/// A 128-bit vector of [2 x double].
464/// \returns A 128-bit vector containing the comparison results.
465static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
466 __m128d __b) {
467 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
468}
469
470/// Compares each of the corresponding double-precision values of the
471/// 128-bit vectors of [2 x double] to determine if the values in the first
472/// operand are less than or equal to those in the second operand.
473///
474/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
475/// If either value in a comparison is NaN, returns false.
476///
477/// \headerfile <x86intrin.h>
478///
479/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
480///
481/// \param __a
482/// A 128-bit vector of [2 x double].
483/// \param __b
484/// A 128-bit vector of [2 x double].
485/// \returns A 128-bit vector containing the comparison results.
486static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
487 __m128d __b) {
488 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
489}
490
491/// Compares each of the corresponding double-precision values of the
492/// 128-bit vectors of [2 x double] to determine if the values in the first
493/// operand are greater than those in the second operand.
494///
495/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
496/// If either value in a comparison is NaN, returns false.
497///
498/// \headerfile <x86intrin.h>
499///
500/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
501///
502/// \param __a
503/// A 128-bit vector of [2 x double].
504/// \param __b
505/// A 128-bit vector of [2 x double].
506/// \returns A 128-bit vector containing the comparison results.
507static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
508 __m128d __b) {
509 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
510}
511
512/// Compares each of the corresponding double-precision values of the
513/// 128-bit vectors of [2 x double] to determine if the values in the first
514/// operand are greater than or equal to those in the second operand.
515///
516/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
517/// If either value in a comparison is NaN, returns false.
518///
519/// \headerfile <x86intrin.h>
520///
521/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
522///
523/// \param __a
524/// A 128-bit vector of [2 x double].
525/// \param __b
526/// A 128-bit vector of [2 x double].
527/// \returns A 128-bit vector containing the comparison results.
528static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
529 __m128d __b) {
530 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
531}
532
533/// Compares each of the corresponding double-precision values of the
534/// 128-bit vectors of [2 x double] to determine if the values in the first
535/// operand are ordered with respect to those in the second operand.
536///
537/// A pair of double-precision values are ordered with respect to each
538/// other if neither value is a NaN. Each comparison returns 0x0 for false,
539/// 0xFFFFFFFFFFFFFFFF for true.
540///
541/// \headerfile <x86intrin.h>
542///
543/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
544///
545/// \param __a
546/// A 128-bit vector of [2 x double].
547/// \param __b
548/// A 128-bit vector of [2 x double].
549/// \returns A 128-bit vector containing the comparison results.
550static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
551 __m128d __b) {
552 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
553}
554
555/// Compares each of the corresponding double-precision values of the
556/// 128-bit vectors of [2 x double] to determine if the values in the first
557/// operand are unordered with respect to those in the second operand.
558///
559/// A pair of double-precision values are unordered with respect to each
560/// other if one or both values are NaN. Each comparison returns 0x0 for
561/// false, 0xFFFFFFFFFFFFFFFF for true.
562///
563/// \headerfile <x86intrin.h>
564///
565/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
566/// instruction.
567///
568/// \param __a
569/// A 128-bit vector of [2 x double].
570/// \param __b
571/// A 128-bit vector of [2 x double].
572/// \returns A 128-bit vector containing the comparison results.
573static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
574 __m128d __b) {
575 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
576}
577
578/// Compares each of the corresponding double-precision values of the
579/// 128-bit vectors of [2 x double] to determine if the values in the first
580/// operand are unequal to those in the second operand.
581///
582/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
583/// If either value in a comparison is NaN, returns true.
584///
585/// \headerfile <x86intrin.h>
586///
587/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
588///
589/// \param __a
590/// A 128-bit vector of [2 x double].
591/// \param __b
592/// A 128-bit vector of [2 x double].
593/// \returns A 128-bit vector containing the comparison results.
594static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
595 __m128d __b) {
596 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
597}
598
599/// Compares each of the corresponding double-precision values of the
600/// 128-bit vectors of [2 x double] to determine if the values in the first
601/// operand are not less than those in the second operand.
602///
603/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
604/// If either value in a comparison is NaN, returns true.
605///
606/// \headerfile <x86intrin.h>
607///
608/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
609///
610/// \param __a
611/// A 128-bit vector of [2 x double].
612/// \param __b
613/// A 128-bit vector of [2 x double].
614/// \returns A 128-bit vector containing the comparison results.
615static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
616 __m128d __b) {
617 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
618}
619
620/// Compares each of the corresponding double-precision values of the
621/// 128-bit vectors of [2 x double] to determine if the values in the first
622/// operand are not less than or equal to those in the second operand.
623///
624/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
625/// If either value in a comparison is NaN, returns true.
626///
627/// \headerfile <x86intrin.h>
628///
629/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
630///
631/// \param __a
632/// A 128-bit vector of [2 x double].
633/// \param __b
634/// A 128-bit vector of [2 x double].
635/// \returns A 128-bit vector containing the comparison results.
636static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
637 __m128d __b) {
638 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
639}
640
641/// Compares each of the corresponding double-precision values of the
642/// 128-bit vectors of [2 x double] to determine if the values in the first
643/// operand are not greater than those in the second operand.
644///
645/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
646/// If either value in a comparison is NaN, returns true.
647///
648/// \headerfile <x86intrin.h>
649///
650/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
651///
652/// \param __a
653/// A 128-bit vector of [2 x double].
654/// \param __b
655/// A 128-bit vector of [2 x double].
656/// \returns A 128-bit vector containing the comparison results.
657static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
658 __m128d __b) {
659 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
660}
661
662/// Compares each of the corresponding double-precision values of the
663/// 128-bit vectors of [2 x double] to determine if the values in the first
664/// operand are not greater than or equal to those in the second operand.
665///
666/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
667/// If either value in a comparison is NaN, returns true.
668///
669/// \headerfile <x86intrin.h>
670///
671/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
672///
673/// \param __a
674/// A 128-bit vector of [2 x double].
675/// \param __b
676/// A 128-bit vector of [2 x double].
677/// \returns A 128-bit vector containing the comparison results.
678static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
679 __m128d __b) {
680 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
681}
682
683/// Compares the lower double-precision floating-point values in each of
684/// the two 128-bit floating-point vectors of [2 x double] for equality.
685///
686/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
687/// If either value in a comparison is NaN, returns false.
688///
689/// \headerfile <x86intrin.h>
690///
691/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
692///
693/// \param __a
694/// A 128-bit vector of [2 x double]. The lower double-precision value is
695/// compared to the lower double-precision value of \a __b.
696/// \param __b
697/// A 128-bit vector of [2 x double]. The lower double-precision value is
698/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
701static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
702 __m128d __b) {
703 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
704}
705
706/// Compares the lower double-precision floating-point values in each of
707/// the two 128-bit floating-point vectors of [2 x double] to determine if
708/// the value in the first parameter is less than the corresponding value in
709/// the second parameter.
710///
711/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
712/// If either value in a comparison is NaN, returns false.
713///
714/// \headerfile <x86intrin.h>
715///
716/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
717///
718/// \param __a
719/// A 128-bit vector of [2 x double]. The lower double-precision value is
720/// compared to the lower double-precision value of \a __b.
721/// \param __b
722/// A 128-bit vector of [2 x double]. The lower double-precision value is
723/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
726static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
727 __m128d __b) {
728 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
729}
730
731/// Compares the lower double-precision floating-point values in each of
732/// the two 128-bit floating-point vectors of [2 x double] to determine if
733/// the value in the first parameter is less than or equal to the
734/// corresponding value in the second parameter.
735///
736/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
737/// If either value in a comparison is NaN, returns false.
738///
739/// \headerfile <x86intrin.h>
740///
741/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
742///
743/// \param __a
744/// A 128-bit vector of [2 x double]. The lower double-precision value is
745/// compared to the lower double-precision value of \a __b.
746/// \param __b
747/// A 128-bit vector of [2 x double]. The lower double-precision value is
748/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
751static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
752 __m128d __b) {
753 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
754}
755
756/// Compares the lower double-precision floating-point values in each of
757/// the two 128-bit floating-point vectors of [2 x double] to determine if
758/// the value in the first parameter is greater than the corresponding value
759/// in the second parameter.
760///
761/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
762/// If either value in a comparison is NaN, returns false.
763///
764/// \headerfile <x86intrin.h>
765///
766/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
767///
768/// \param __a
769/// A 128-bit vector of [2 x double]. The lower double-precision value is
770/// compared to the lower double-precision value of \a __b.
771/// \param __b
772/// A 128-bit vector of [2 x double]. The lower double-precision value is
773/// compared to the lower double-precision value of \a __a.
774/// \returns A 128-bit vector. The lower 64 bits contains the comparison
775/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
776static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
777 __m128d __b) {
778 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
779 return __extension__(__m128d){__c[0], __a[1]};
780}
781
782/// Compares the lower double-precision floating-point values in each of
783/// the two 128-bit floating-point vectors of [2 x double] to determine if
784/// the value in the first parameter is greater than or equal to the
785/// corresponding value in the second parameter.
786///
787/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
788/// If either value in a comparison is NaN, returns false.
789///
790/// \headerfile <x86intrin.h>
791///
792/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
793///
794/// \param __a
795/// A 128-bit vector of [2 x double]. The lower double-precision value is
796/// compared to the lower double-precision value of \a __b.
797/// \param __b
798/// A 128-bit vector of [2 x double]. The lower double-precision value is
799/// compared to the lower double-precision value of \a __a.
800/// \returns A 128-bit vector. The lower 64 bits contains the comparison
801/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
802static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
803 __m128d __b) {
804 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
805 return __extension__(__m128d){__c[0], __a[1]};
806}
807
808/// Compares the lower double-precision floating-point values in each of
809/// the two 128-bit floating-point vectors of [2 x double] to determine if
810/// the value in the first parameter is ordered with respect to the
811/// corresponding value in the second parameter.
812///
813/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
814/// of double-precision values are ordered with respect to each other if
815/// neither value is a NaN.
816///
817/// \headerfile <x86intrin.h>
818///
819/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
820///
821/// \param __a
822/// A 128-bit vector of [2 x double]. The lower double-precision value is
823/// compared to the lower double-precision value of \a __b.
824/// \param __b
825/// A 128-bit vector of [2 x double]. The lower double-precision value is
826/// compared to the lower double-precision value of \a __a.
827/// \returns A 128-bit vector. The lower 64 bits contains the comparison
828/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
829static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
830 __m128d __b) {
831 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
832}
833
834/// Compares the lower double-precision floating-point values in each of
835/// the two 128-bit floating-point vectors of [2 x double] to determine if
836/// the value in the first parameter is unordered with respect to the
837/// corresponding value in the second parameter.
838///
839/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
840/// of double-precision values are unordered with respect to each other if
841/// one or both values are NaN.
842///
843/// \headerfile <x86intrin.h>
844///
845/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
846/// instruction.
847///
848/// \param __a
849/// A 128-bit vector of [2 x double]. The lower double-precision value is
850/// compared to the lower double-precision value of \a __b.
851/// \param __b
852/// A 128-bit vector of [2 x double]. The lower double-precision value is
853/// compared to the lower double-precision value of \a __a.
854/// \returns A 128-bit vector. The lower 64 bits contains the comparison
855/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
856static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
857 __m128d __b) {
858 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
859}
860
861/// Compares the lower double-precision floating-point values in each of
862/// the two 128-bit floating-point vectors of [2 x double] to determine if
863/// the value in the first parameter is unequal to the corresponding value in
864/// the second parameter.
865///
866/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
867/// If either value in a comparison is NaN, returns true.
868///
869/// \headerfile <x86intrin.h>
870///
871/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
872///
873/// \param __a
874/// A 128-bit vector of [2 x double]. The lower double-precision value is
875/// compared to the lower double-precision value of \a __b.
876/// \param __b
877/// A 128-bit vector of [2 x double]. The lower double-precision value is
878/// compared to the lower double-precision value of \a __a.
879/// \returns A 128-bit vector. The lower 64 bits contains the comparison
880/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
881static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
882 __m128d __b) {
883 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
884}
885
886/// Compares the lower double-precision floating-point values in each of
887/// the two 128-bit floating-point vectors of [2 x double] to determine if
888/// the value in the first parameter is not less than the corresponding
889/// value in the second parameter.
890///
891/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
892/// If either value in a comparison is NaN, returns true.
893///
894/// \headerfile <x86intrin.h>
895///
896/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
897///
898/// \param __a
899/// A 128-bit vector of [2 x double]. The lower double-precision value is
900/// compared to the lower double-precision value of \a __b.
901/// \param __b
902/// A 128-bit vector of [2 x double]. The lower double-precision value is
903/// compared to the lower double-precision value of \a __a.
904/// \returns A 128-bit vector. The lower 64 bits contains the comparison
905/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
906static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
907 __m128d __b) {
908 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
909}
910
911/// Compares the lower double-precision floating-point values in each of
912/// the two 128-bit floating-point vectors of [2 x double] to determine if
913/// the value in the first parameter is not less than or equal to the
914/// corresponding value in the second parameter.
915///
916/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
917/// If either value in a comparison is NaN, returns true.
918///
919/// \headerfile <x86intrin.h>
920///
921/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
922///
923/// \param __a
924/// A 128-bit vector of [2 x double]. The lower double-precision value is
925/// compared to the lower double-precision value of \a __b.
926/// \param __b
927/// A 128-bit vector of [2 x double]. The lower double-precision value is
928/// compared to the lower double-precision value of \a __a.
929/// \returns A 128-bit vector. The lower 64 bits contains the comparison
930/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
931static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
932 __m128d __b) {
933 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
934}
935
936/// Compares the lower double-precision floating-point values in each of
937/// the two 128-bit floating-point vectors of [2 x double] to determine if
938/// the value in the first parameter is not greater than the corresponding
939/// value in the second parameter.
940///
941/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
942/// If either value in a comparison is NaN, returns true.
943///
944/// \headerfile <x86intrin.h>
945///
946/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
947///
948/// \param __a
949/// A 128-bit vector of [2 x double]. The lower double-precision value is
950/// compared to the lower double-precision value of \a __b.
951/// \param __b
952/// A 128-bit vector of [2 x double]. The lower double-precision value is
953/// compared to the lower double-precision value of \a __a.
954/// \returns A 128-bit vector. The lower 64 bits contains the comparison
955/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
956static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
957 __m128d __b) {
958 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
959 return __extension__(__m128d){__c[0], __a[1]};
960}
961
962/// Compares the lower double-precision floating-point values in each of
963/// the two 128-bit floating-point vectors of [2 x double] to determine if
964/// the value in the first parameter is not greater than or equal to the
965/// corresponding value in the second parameter.
966///
967/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
968/// If either value in a comparison is NaN, returns true.
969///
970/// \headerfile <x86intrin.h>
971///
972/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
973///
974/// \param __a
975/// A 128-bit vector of [2 x double]. The lower double-precision value is
976/// compared to the lower double-precision value of \a __b.
977/// \param __b
978/// A 128-bit vector of [2 x double]. The lower double-precision value is
979/// compared to the lower double-precision value of \a __a.
980/// \returns A 128-bit vector. The lower 64 bits contains the comparison
981/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
982static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
983 __m128d __b) {
984 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
985 return __extension__(__m128d){__c[0], __a[1]};
986}
987
988/// Compares the lower double-precision floating-point values in each of
989/// the two 128-bit floating-point vectors of [2 x double] for equality.
990///
991/// The comparison returns 0 for false, 1 for true. If either value in a
992/// comparison is NaN, returns 0.
993///
994/// \headerfile <x86intrin.h>
995///
996/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
997///
998/// \param __a
999/// A 128-bit vector of [2 x double]. The lower double-precision value is
1000/// compared to the lower double-precision value of \a __b.
1001/// \param __b
1002/// A 128-bit vector of [2 x double]. The lower double-precision value is
1003/// compared to the lower double-precision value of \a __a.
1004/// \returns An integer containing the comparison results.
1005static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
1006 __m128d __b) {
1007 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1008}
1009
1010/// Compares the lower double-precision floating-point values in each of
1011/// the two 128-bit floating-point vectors of [2 x double] to determine if
1012/// the value in the first parameter is less than the corresponding value in
1013/// the second parameter.
1014///
1015/// The comparison returns 0 for false, 1 for true. If either value in a
1016/// comparison is NaN, returns 0.
1017///
1018/// \headerfile <x86intrin.h>
1019///
1020/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1021///
1022/// \param __a
1023/// A 128-bit vector of [2 x double]. The lower double-precision value is
1024/// compared to the lower double-precision value of \a __b.
1025/// \param __b
1026/// A 128-bit vector of [2 x double]. The lower double-precision value is
1027/// compared to the lower double-precision value of \a __a.
1028/// \returns An integer containing the comparison results.
1029static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1030 __m128d __b) {
1031 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1032}
1033
1034/// Compares the lower double-precision floating-point values in each of
1035/// the two 128-bit floating-point vectors of [2 x double] to determine if
1036/// the value in the first parameter is less than or equal to the
1037/// corresponding value in the second parameter.
1038///
1039/// The comparison returns 0 for false, 1 for true. If either value in a
1040/// comparison is NaN, returns 0.
1041///
1042/// \headerfile <x86intrin.h>
1043///
1044/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1045///
1046/// \param __a
1047/// A 128-bit vector of [2 x double]. The lower double-precision value is
1048/// compared to the lower double-precision value of \a __b.
1049/// \param __b
1050/// A 128-bit vector of [2 x double]. The lower double-precision value is
1051/// compared to the lower double-precision value of \a __a.
1052/// \returns An integer containing the comparison results.
1053static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1054 __m128d __b) {
1055 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1056}
1057
1058/// Compares the lower double-precision floating-point values in each of
1059/// the two 128-bit floating-point vectors of [2 x double] to determine if
1060/// the value in the first parameter is greater than the corresponding value
1061/// in the second parameter.
1062///
1063/// The comparison returns 0 for false, 1 for true. If either value in a
1064/// comparison is NaN, returns 0.
1065///
1066/// \headerfile <x86intrin.h>
1067///
1068/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1069///
1070/// \param __a
1071/// A 128-bit vector of [2 x double]. The lower double-precision value is
1072/// compared to the lower double-precision value of \a __b.
1073/// \param __b
1074/// A 128-bit vector of [2 x double]. The lower double-precision value is
1075/// compared to the lower double-precision value of \a __a.
1076/// \returns An integer containing the comparison results.
1077static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1078 __m128d __b) {
1079 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1080}
1081
1082/// Compares the lower double-precision floating-point values in each of
1083/// the two 128-bit floating-point vectors of [2 x double] to determine if
1084/// the value in the first parameter is greater than or equal to the
1085/// corresponding value in the second parameter.
1086///
1087/// The comparison returns 0 for false, 1 for true. If either value in a
1088/// comparison is NaN, returns 0.
1089///
1090/// \headerfile <x86intrin.h>
1091///
1092/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1093///
1094/// \param __a
1095/// A 128-bit vector of [2 x double]. The lower double-precision value is
1096/// compared to the lower double-precision value of \a __b.
1097/// \param __b
1098/// A 128-bit vector of [2 x double]. The lower double-precision value is
1099/// compared to the lower double-precision value of \a __a.
1100/// \returns An integer containing the comparison results.
1101static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1102 __m128d __b) {
1103 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1104}
1105
1106/// Compares the lower double-precision floating-point values in each of
1107/// the two 128-bit floating-point vectors of [2 x double] to determine if
1108/// the value in the first parameter is unequal to the corresponding value in
1109/// the second parameter.
1110///
1111/// The comparison returns 0 for false, 1 for true. If either value in a
1112/// comparison is NaN, returns 1.
1113///
1114/// \headerfile <x86intrin.h>
1115///
1116/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1117///
1118/// \param __a
1119/// A 128-bit vector of [2 x double]. The lower double-precision value is
1120/// compared to the lower double-precision value of \a __b.
1121/// \param __b
1122/// A 128-bit vector of [2 x double]. The lower double-precision value is
1123/// compared to the lower double-precision value of \a __a.
1124/// \returns An integer containing the comparison results.
1125static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1126 __m128d __b) {
1127 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1128}
1129
1130/// Compares the lower double-precision floating-point values in each of
1131/// the two 128-bit floating-point vectors of [2 x double] for equality.
1132///
1133/// The comparison returns 0 for false, 1 for true. If either value in a
1134/// comparison is NaN, returns 0.
1135///
1136/// \headerfile <x86intrin.h>
1137///
1138/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1139///
1140/// \param __a
1141/// A 128-bit vector of [2 x double]. The lower double-precision value is
1142/// compared to the lower double-precision value of \a __b.
1143/// \param __b
1144/// A 128-bit vector of [2 x double]. The lower double-precision value is
1145/// compared to the lower double-precision value of \a __a.
1146/// \returns An integer containing the comparison results.
1147static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1148 __m128d __b) {
1149 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1150}
1151
1152/// Compares the lower double-precision floating-point values in each of
1153/// the two 128-bit floating-point vectors of [2 x double] to determine if
1154/// the value in the first parameter is less than the corresponding value in
1155/// the second parameter.
1156///
1157/// The comparison returns 0 for false, 1 for true. If either value in a
1158/// comparison is NaN, returns 0.
1159///
1160/// \headerfile <x86intrin.h>
1161///
1162/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1163///
1164/// \param __a
1165/// A 128-bit vector of [2 x double]. The lower double-precision value is
1166/// compared to the lower double-precision value of \a __b.
1167/// \param __b
1168/// A 128-bit vector of [2 x double]. The lower double-precision value is
1169/// compared to the lower double-precision value of \a __a.
1170/// \returns An integer containing the comparison results.
1171static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1172 __m128d __b) {
1173 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1174}
1175
1176/// Compares the lower double-precision floating-point values in each of
1177/// the two 128-bit floating-point vectors of [2 x double] to determine if
1178/// the value in the first parameter is less than or equal to the
1179/// corresponding value in the second parameter.
1180///
1181/// The comparison returns 0 for false, 1 for true. If either value in a
1182/// comparison is NaN, returns 0.
1183///
1184/// \headerfile <x86intrin.h>
1185///
1186/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1187///
1188/// \param __a
1189/// A 128-bit vector of [2 x double]. The lower double-precision value is
1190/// compared to the lower double-precision value of \a __b.
1191/// \param __b
1192/// A 128-bit vector of [2 x double]. The lower double-precision value is
1193/// compared to the lower double-precision value of \a __a.
1194/// \returns An integer containing the comparison results.
1195static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1196 __m128d __b) {
1197 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1198}
1199
1200/// Compares the lower double-precision floating-point values in each of
1201/// the two 128-bit floating-point vectors of [2 x double] to determine if
1202/// the value in the first parameter is greater than the corresponding value
1203/// in the second parameter.
1204///
1205/// The comparison returns 0 for false, 1 for true. If either value in a
1206/// comparison is NaN, returns 0.
1207///
1208/// \headerfile <x86intrin.h>
1209///
1210/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1211///
1212/// \param __a
1213/// A 128-bit vector of [2 x double]. The lower double-precision value is
1214/// compared to the lower double-precision value of \a __b.
1215/// \param __b
1216/// A 128-bit vector of [2 x double]. The lower double-precision value is
1217/// compared to the lower double-precision value of \a __a.
1218/// \returns An integer containing the comparison results.
1219static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1220 __m128d __b) {
1221 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1222}
1223
1224/// Compares the lower double-precision floating-point values in each of
1225/// the two 128-bit floating-point vectors of [2 x double] to determine if
1226/// the value in the first parameter is greater than or equal to the
1227/// corresponding value in the second parameter.
1228///
1229/// The comparison returns 0 for false, 1 for true. If either value in a
1230/// comparison is NaN, returns 0.
1231///
1232/// \headerfile <x86intrin.h>
1233///
1234/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1235///
1236/// \param __a
1237/// A 128-bit vector of [2 x double]. The lower double-precision value is
1238/// compared to the lower double-precision value of \a __b.
1239/// \param __b
1240/// A 128-bit vector of [2 x double]. The lower double-precision value is
1241/// compared to the lower double-precision value of \a __a.
1242/// \returns An integer containing the comparison results.
1243static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1244 __m128d __b) {
1245 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1246}
1247
1248/// Compares the lower double-precision floating-point values in each of
1249/// the two 128-bit floating-point vectors of [2 x double] to determine if
1250/// the value in the first parameter is unequal to the corresponding value in
1251/// the second parameter.
1252///
1253/// The comparison returns 0 for false, 1 for true. If either value in a
1254/// comparison is NaN, returns 1.
1255///
1256/// \headerfile <x86intrin.h>
1257///
1258/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1259///
1260/// \param __a
1261/// A 128-bit vector of [2 x double]. The lower double-precision value is
1262/// compared to the lower double-precision value of \a __b.
1263/// \param __b
1264/// A 128-bit vector of [2 x double]. The lower double-precision value is
1265/// compared to the lower double-precision value of \a __a.
1266/// \returns An integer containing the comparison result.
1267static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1268 __m128d __b) {
1269 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1270}
1271
1272/// Converts the two double-precision floating-point elements of a
1273/// 128-bit vector of [2 x double] into two single-precision floating-point
1274/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1275/// The upper 64 bits of the result vector are set to zero.
1276///
1277/// \headerfile <x86intrin.h>
1278///
1279/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1280///
1281/// \param __a
1282/// A 128-bit vector of [2 x double].
1283/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1284/// converted values. The upper 64 bits are set to zero.
1285static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1286 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1287}
1288
1289/// Converts the lower two single-precision floating-point elements of a
1290/// 128-bit vector of [4 x float] into two double-precision floating-point
1291/// values, returned in a 128-bit vector of [2 x double]. The upper two
1292/// elements of the input vector are unused.
1293///
1294/// \headerfile <x86intrin.h>
1295///
1296/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1297///
1298/// \param __a
1299/// A 128-bit vector of [4 x float]. The lower two single-precision
1300/// floating-point elements are converted to double-precision values. The
1301/// upper two elements are unused.
1302/// \returns A 128-bit vector of [2 x double] containing the converted values.
1303static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1304 return (__m128d) __builtin_convertvector(
1305 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1306}
1307
1308/// Converts the lower two integer elements of a 128-bit vector of
1309/// [4 x i32] into two double-precision floating-point values, returned in a
1310/// 128-bit vector of [2 x double].
1311///
1312/// The upper two elements of the input vector are unused.
1313///
1314/// \headerfile <x86intrin.h>
1315///
1316/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1317///
1318/// \param __a
1319/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1320/// converted to double-precision values.
1321///
1322/// The upper two elements are unused.
1323/// \returns A 128-bit vector of [2 x double] containing the converted values.
1324static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1325 return (__m128d) __builtin_convertvector(
1326 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1327}
1328
1329/// Converts the two double-precision floating-point elements of a
1330/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1331/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1332/// 64 bits of the result vector are set to zero.
1333///
1334/// If a converted value does not fit in a 32-bit integer, raises a
1335/// floating-point invalid exception. If the exception is masked, returns
1336/// the most negative integer.
1337///
1338/// \headerfile <x86intrin.h>
1339///
1340/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1341///
1342/// \param __a
1343/// A 128-bit vector of [2 x double].
1344/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1345/// converted values. The upper 64 bits are set to zero.
1346static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1347 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1348}
1349
1350/// Converts the low-order element of a 128-bit vector of [2 x double]
1351/// into a 32-bit signed integer value.
1352///
1353/// If the converted value does not fit in a 32-bit integer, raises a
1354/// floating-point invalid exception. If the exception is masked, returns
1355/// the most negative integer.
1356///
1357/// \headerfile <x86intrin.h>
1358///
1359/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1360///
1361/// \param __a
1362/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1363/// conversion.
1364/// \returns A 32-bit signed integer containing the converted value.
1365static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1366 return __builtin_ia32_cvtsd2si((__v2df)__a);
1367}
1368
1369/// Converts the lower double-precision floating-point element of a
1370/// 128-bit vector of [2 x double], in the second parameter, into a
1371/// single-precision floating-point value, returned in the lower 32 bits of a
1372/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1373/// copied from the upper 96 bits of the first parameter.
1374///
1375/// \headerfile <x86intrin.h>
1376///
1377/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1378///
1379/// \param __a
1380/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1381/// copied to the upper 96 bits of the result.
1382/// \param __b
1383/// A 128-bit vector of [2 x double]. The lower double-precision
1384/// floating-point element is used in the conversion.
1385/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1386/// converted value from the second parameter. The upper 96 bits are copied
1387/// from the upper 96 bits of the first parameter.
1388static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1389 __m128d __b) {
1390 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1391}
1392
1393/// Converts a 32-bit signed integer value, in the second parameter, into
1394/// a double-precision floating-point value, returned in the lower 64 bits of
1395/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1396/// are copied from the upper 64 bits of the first parameter.
1397///
1398/// \headerfile <x86intrin.h>
1399///
1400/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1401///
1402/// \param __a
1403/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1404/// copied to the upper 64 bits of the result.
1405/// \param __b
1406/// A 32-bit signed integer containing the value to be converted.
1407/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1408/// converted value from the second parameter. The upper 64 bits are copied
1409/// from the upper 64 bits of the first parameter.
1410static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1411 int __b) {
1412 __a[0] = __b;
1413 return __a;
1414}
1415
1416/// Converts the lower single-precision floating-point element of a
1417/// 128-bit vector of [4 x float], in the second parameter, into a
1418/// double-precision floating-point value, returned in the lower 64 bits of
1419/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1420/// are copied from the upper 64 bits of the first parameter.
1421///
1422/// \headerfile <x86intrin.h>
1423///
1424/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1425///
1426/// \param __a
1427/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1428/// copied to the upper 64 bits of the result.
1429/// \param __b
1430/// A 128-bit vector of [4 x float]. The lower single-precision
1431/// floating-point element is used in the conversion.
1432/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1433/// converted value from the second parameter. The upper 64 bits are copied
1434/// from the upper 64 bits of the first parameter.
1435static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1436 __m128 __b) {
1437 __a[0] = __b[0];
1438 return __a;
1439}
1440
1441/// Converts the two double-precision floating-point elements of a
1442/// 128-bit vector of [2 x double] into two signed truncated (rounded
1443/// toward zero) 32-bit integer values, returned in the lower 64 bits
1444/// of a 128-bit vector of [4 x i32].
1445///
1446/// If a converted value does not fit in a 32-bit integer, raises a
1447/// floating-point invalid exception. If the exception is masked, returns
1448/// the most negative integer.
1449///
1450/// \headerfile <x86intrin.h>
1451///
1452/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1453/// instruction.
1454///
1455/// \param __a
1456/// A 128-bit vector of [2 x double].
1457/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1458/// converted values. The upper 64 bits are set to zero.
1459static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1460 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1461}
1462
1463/// Converts the low-order element of a [2 x double] vector into a 32-bit
1464/// signed truncated (rounded toward zero) integer value.
1465///
1466/// If the converted value does not fit in a 32-bit integer, raises a
1467/// floating-point invalid exception. If the exception is masked, returns
1468/// the most negative integer.
1469///
1470/// \headerfile <x86intrin.h>
1471///
1472/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1473/// instruction.
1474///
1475/// \param __a
1476/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1477/// conversion.
1478/// \returns A 32-bit signed integer containing the converted value.
1479static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1480 return __builtin_ia32_cvttsd2si((__v2df)__a);
1481}
1482
1483/// Converts the two double-precision floating-point elements of a
1484/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1485/// returned in a 64-bit vector of [2 x i32].
1486///
1487/// If a converted value does not fit in a 32-bit integer, raises a
1488/// floating-point invalid exception. If the exception is masked, returns
1489/// the most negative integer.
1490///
1491/// \headerfile <x86intrin.h>
1492///
1493/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1494///
1495/// \param __a
1496/// A 128-bit vector of [2 x double].
1497/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1498static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
1499 return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
1500}
1501
1502/// Converts the two double-precision floating-point elements of a
1503/// 128-bit vector of [2 x double] into two signed truncated (rounded toward
1504/// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1505///
1506/// If a converted value does not fit in a 32-bit integer, raises a
1507/// floating-point invalid exception. If the exception is masked, returns
1508/// the most negative integer.
1509///
1510/// \headerfile <x86intrin.h>
1511///
1512/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1513///
1514/// \param __a
1515/// A 128-bit vector of [2 x double].
1516/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1517static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
1518 return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
1519}
1520
1521/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1522/// [2 x i32] into two double-precision floating-point values, returned in a
1523/// 128-bit vector of [2 x double].
1524///
1525/// \headerfile <x86intrin.h>
1526///
1527/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1528///
1529/// \param __a
1530/// A 64-bit vector of [2 x i32].
1531/// \returns A 128-bit vector of [2 x double] containing the converted values.
1532static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
1533 return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
1534}
1535
1536/// Returns the low-order element of a 128-bit vector of [2 x double] as
1537/// a double-precision floating-point value.
1538///
1539/// \headerfile <x86intrin.h>
1540///
1541/// This intrinsic has no corresponding instruction.
1542///
1543/// \param __a
1544/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1545/// \returns A double-precision floating-point value copied from the lower 64
1546/// bits of \a __a.
1547static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1548 return __a[0];
1549}
1550
1551/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1552/// memory location.
1553///
1554/// \headerfile <x86intrin.h>
1555///
1556/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1557///
1558/// \param __dp
1559/// A pointer to a 128-bit memory location. The address of the memory
1560/// location has to be 16-byte aligned.
1561/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1562static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1563 return *(const __m128d *)__dp;
1564}
1565
1566/// Loads a double-precision floating-point value from a specified memory
1567/// location and duplicates it to both vector elements of a 128-bit vector of
1568/// [2 x double].
1569///
1570/// \headerfile <x86intrin.h>
1571///
1572/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1573///
1574/// \param __dp
1575/// A pointer to a memory location containing a double-precision value.
1576/// \returns A 128-bit vector of [2 x double] containing the loaded and
1577/// duplicated values.
1578static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1579 struct __mm_load1_pd_struct {
1580 double __u;
1581 } __attribute__((__packed__, __may_alias__));
1582 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1583 return __extension__(__m128d){__u, __u};
1584}
1585
1586#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1587
1588/// Loads two double-precision values, in reverse order, from an aligned
1589/// memory location into a 128-bit vector of [2 x double].
1590///
1591/// \headerfile <x86intrin.h>
1592///
1593/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1594/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1595/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1596///
1597/// \param __dp
1598/// A 16-byte aligned pointer to an array of double-precision values to be
1599/// loaded in reverse order.
1600/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1601/// values.
1602static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1603 __m128d __u = *(const __m128d *)__dp;
1604 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1605}
1606
1607/// Loads a 128-bit floating-point vector of [2 x double] from an
1608/// unaligned memory location.
1609///
1610/// \headerfile <x86intrin.h>
1611///
1612/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1613///
1614/// \param __dp
1615/// A pointer to a 128-bit memory location. The address of the memory
1616/// location does not have to be aligned.
1617/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1618static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1619 struct __loadu_pd {
1620 __m128d_u __v;
1621 } __attribute__((__packed__, __may_alias__));
1622 return ((const struct __loadu_pd *)__dp)->__v;
1623}
1624
1625/// Loads a 64-bit integer value to the low element of a 128-bit integer
1626/// vector and clears the upper element.
1627///
1628/// \headerfile <x86intrin.h>
1629///
1630/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1631///
1632/// \param __a
1633/// A pointer to a 64-bit memory location. The address of the memory
1634/// location does not have to be aligned.
1635/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1636static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1637 struct __loadu_si64 {
1638 long long __v;
1639 } __attribute__((__packed__, __may_alias__));
1640 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1641 return __extension__(__m128i)(__v2di){__u, 0LL};
1642}
1643
1644/// Loads a 32-bit integer value to the low element of a 128-bit integer
1645/// vector and clears the upper element.
1646///
1647/// \headerfile <x86intrin.h>
1648///
1649/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1650///
1651/// \param __a
1652/// A pointer to a 32-bit memory location. The address of the memory
1653/// location does not have to be aligned.
1654/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1655static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1656 struct __loadu_si32 {
1657 int __v;
1658 } __attribute__((__packed__, __may_alias__));
1659 int __u = ((const struct __loadu_si32 *)__a)->__v;
1660 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1661}
1662
1663/// Loads a 16-bit integer value to the low element of a 128-bit integer
1664/// vector and clears the upper element.
1665///
1666/// \headerfile <x86intrin.h>
1667///
1668/// This intrinsic does not correspond to a specific instruction.
1669///
1670/// \param __a
1671/// A pointer to a 16-bit memory location. The address of the memory
1672/// location does not have to be aligned.
1673/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1674static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1675 struct __loadu_si16 {
1676 short __v;
1677 } __attribute__((__packed__, __may_alias__));
1678 short __u = ((const struct __loadu_si16 *)__a)->__v;
1679 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1680}
1681
1682/// Loads a 64-bit double-precision value to the low element of a
1683/// 128-bit integer vector and clears the upper element.
1684///
1685/// \headerfile <x86intrin.h>
1686///
1687/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1688///
1689/// \param __dp
1690/// A pointer to a memory location containing a double-precision value.
1691/// The address of the memory location does not have to be aligned.
1692/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1693static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1694 struct __mm_load_sd_struct {
1695 double __u;
1696 } __attribute__((__packed__, __may_alias__));
1697 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1698 return __extension__(__m128d){__u, 0};
1699}
1700
1701/// Loads a double-precision value into the high-order bits of a 128-bit
1702/// vector of [2 x double]. The low-order bits are copied from the low-order
1703/// bits of the first operand.
1704///
1705/// \headerfile <x86intrin.h>
1706///
1707/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1708///
1709/// \param __a
1710/// A 128-bit vector of [2 x double]. \n
1711/// Bits [63:0] are written to bits [63:0] of the result.
1712/// \param __dp
1713/// A pointer to a 64-bit memory location containing a double-precision
1714/// floating-point value that is loaded. The loaded value is written to bits
1715/// [127:64] of the result. The address of the memory location does not have
1716/// to be aligned.
1717/// \returns A 128-bit vector of [2 x double] containing the moved values.
1718static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1719 double const *__dp) {
1720 struct __mm_loadh_pd_struct {
1721 double __u;
1722 } __attribute__((__packed__, __may_alias__));
1723 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1724 return __extension__(__m128d){__a[0], __u};
1725}
1726
1727/// Loads a double-precision value into the low-order bits of a 128-bit
1728/// vector of [2 x double]. The high-order bits are copied from the
1729/// high-order bits of the first operand.
1730///
1731/// \headerfile <x86intrin.h>
1732///
1733/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1734///
1735/// \param __a
1736/// A 128-bit vector of [2 x double]. \n
1737/// Bits [127:64] are written to bits [127:64] of the result.
1738/// \param __dp
1739/// A pointer to a 64-bit memory location containing a double-precision
1740/// floating-point value that is loaded. The loaded value is written to bits
1741/// [63:0] of the result. The address of the memory location does not have to
1742/// be aligned.
1743/// \returns A 128-bit vector of [2 x double] containing the moved values.
1744static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1745 double const *__dp) {
1746 struct __mm_loadl_pd_struct {
1747 double __u;
1748 } __attribute__((__packed__, __may_alias__));
1749 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1750 return __extension__(__m128d){__u, __a[1]};
1751}
1752
1753/// Constructs a 128-bit floating-point vector of [2 x double] with
1754/// unspecified content. This could be used as an argument to another
1755/// intrinsic function where the argument is required but the value is not
1756/// actually used.
1757///
1758/// \headerfile <x86intrin.h>
1759///
1760/// This intrinsic has no corresponding instruction.
1761///
1762/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1763/// content.
1764static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1765 return (__m128d)__builtin_ia32_undef128();
1766}
1767
1768/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1769/// 64 bits of the vector are initialized with the specified double-precision
1770/// floating-point value. The upper 64 bits are set to zero.
1771///
1772/// \headerfile <x86intrin.h>
1773///
1774/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1775///
1776/// \param __w
1777/// A double-precision floating-point value used to initialize the lower 64
1778/// bits of the result.
1779/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1780/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1781/// set to zero.
1782static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1783 return __extension__(__m128d){__w, 0.0};
1784}
1785
1786/// Constructs a 128-bit floating-point vector of [2 x double], with each
1787/// of the two double-precision floating-point vector elements set to the
1788/// specified double-precision floating-point value.
1789///
1790/// \headerfile <x86intrin.h>
1791///
1792/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1793///
1794/// \param __w
1795/// A double-precision floating-point value used to initialize each vector
1796/// element of the result.
1797/// \returns An initialized 128-bit floating-point vector of [2 x double].
1798static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1799 return __extension__(__m128d){__w, __w};
1800}
1801
1802/// Constructs a 128-bit floating-point vector of [2 x double], with each
1803/// of the two double-precision floating-point vector elements set to the
1804/// specified double-precision floating-point value.
1805///
1806/// \headerfile <x86intrin.h>
1807///
1808/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1809///
1810/// \param __w
1811/// A double-precision floating-point value used to initialize each vector
1812/// element of the result.
1813/// \returns An initialized 128-bit floating-point vector of [2 x double].
1814static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1815 return _mm_set1_pd(__w);
1816}
1817
1818/// Constructs a 128-bit floating-point vector of [2 x double]
1819/// initialized with the specified double-precision floating-point values.
1820///
1821/// \headerfile <x86intrin.h>
1822///
1823/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1824///
1825/// \param __w
1826/// A double-precision floating-point value used to initialize the upper 64
1827/// bits of the result.
1828/// \param __x
1829/// A double-precision floating-point value used to initialize the lower 64
1830/// bits of the result.
1831/// \returns An initialized 128-bit floating-point vector of [2 x double].
1832static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1833 double __x) {
1834 return __extension__(__m128d){__x, __w};
1835}
1836
1837/// Constructs a 128-bit floating-point vector of [2 x double],
1838/// initialized in reverse order with the specified double-precision
1839/// floating-point values.
1840///
1841/// \headerfile <x86intrin.h>
1842///
1843/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1844///
1845/// \param __w
1846/// A double-precision floating-point value used to initialize the lower 64
1847/// bits of the result.
1848/// \param __x
1849/// A double-precision floating-point value used to initialize the upper 64
1850/// bits of the result.
1851/// \returns An initialized 128-bit floating-point vector of [2 x double].
1852static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1853 double __x) {
1854 return __extension__(__m128d){__w, __x};
1855}
1856
1857/// Constructs a 128-bit floating-point vector of [2 x double]
1858/// initialized to zero.
1859///
1860/// \headerfile <x86intrin.h>
1861///
1862/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1863///
1864/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1865/// all elements set to zero.
1866static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1867 return __extension__(__m128d){0.0, 0.0};
1868}
1869
1870/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1871/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1872/// 64 bits are set to the upper 64 bits of the first parameter.
1873///
1874/// \headerfile <x86intrin.h>
1875///
1876/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1877///
1878/// \param __a
1879/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1880/// upper 64 bits of the result.
1881/// \param __b
1882/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1883/// lower 64 bits of the result.
1884/// \returns A 128-bit vector of [2 x double] containing the moved values.
1885static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1886 __m128d __b) {
1887 __a[0] = __b[0];
1888 return __a;
1889}
1890
1891/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1892/// memory location.
1893///
1894/// \headerfile <x86intrin.h>
1895///
1896/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1897///
1898/// \param __dp
1899/// A pointer to a 64-bit memory location.
1900/// \param __a
1901/// A 128-bit vector of [2 x double] containing the value to be stored.
1902static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1903 __m128d __a) {
1904 struct __mm_store_sd_struct {
1905 double __u;
1906 } __attribute__((__packed__, __may_alias__));
1907 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1908}
1909
1910/// Moves packed double-precision values from a 128-bit vector of
1911/// [2 x double] to a memory location.
1912///
1913/// \headerfile <x86intrin.h>
1914///
1915/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1916///
1917/// \param __dp
1918/// A pointer to an aligned memory location that can store two
1919/// double-precision values.
1920/// \param __a
1921/// A packed 128-bit vector of [2 x double] containing the values to be
1922/// moved.
1923static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1924 __m128d __a) {
1925 *(__m128d *)__dp = __a;
1926}
1927
1928/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1929/// the upper and lower 64 bits of a memory location.
1930///
1931/// \headerfile <x86intrin.h>
1932///
1933/// This intrinsic corresponds to the
1934/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1935///
1936/// \param __dp
1937/// A pointer to a memory location that can store two double-precision
1938/// values.
1939/// \param __a
1940/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1941/// of the values in \a __dp.
1942static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1943 __m128d __a) {
1944 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1945 _mm_store_pd(__dp, __a);
1946}
1947
1948/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1949/// the upper and lower 64 bits of a memory location.
1950///
1951/// \headerfile <x86intrin.h>
1952///
1953/// This intrinsic corresponds to the
1954/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1955///
1956/// \param __dp
1957/// A pointer to a memory location that can store two double-precision
1958/// values.
1959/// \param __a
1960/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1961/// of the values in \a __dp.
1962static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1963 __m128d __a) {
1964 _mm_store1_pd(__dp, __a);
1965}
1966
1967/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1968/// location.
1969///
1970/// \headerfile <x86intrin.h>
1971///
1972/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1973///
1974/// \param __dp
1975/// A pointer to a 128-bit memory location. The address of the memory
1976/// location does not have to be aligned.
1977/// \param __a
1978/// A 128-bit vector of [2 x double] containing the values to be stored.
1979static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1980 __m128d __a) {
1981 struct __storeu_pd {
1982 __m128d_u __v;
1983 } __attribute__((__packed__, __may_alias__));
1984 ((struct __storeu_pd *)__dp)->__v = __a;
1985}
1986
1987/// Stores two double-precision values, in reverse order, from a 128-bit
1988/// vector of [2 x double] to a 16-byte aligned memory location.
1989///
1990/// \headerfile <x86intrin.h>
1991///
1992/// This intrinsic corresponds to a shuffling instruction followed by a
1993/// <c> VMOVAPD / MOVAPD </c> instruction.
1994///
1995/// \param __dp
1996/// A pointer to a 16-byte aligned memory location that can store two
1997/// double-precision values.
1998/// \param __a
1999/// A 128-bit vector of [2 x double] containing the values to be reversed and
2000/// stored.
2001static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
2002 __m128d __a) {
2003 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2004 *(__m128d *)__dp = __a;
2005}
2006
2007/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2008/// memory location.
2009///
2010/// \headerfile <x86intrin.h>
2011///
2012/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2013///
2014/// \param __dp
2015/// A pointer to a 64-bit memory location.
2016/// \param __a
2017/// A 128-bit vector of [2 x double] containing the value to be stored.
2018static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2019 __m128d __a) {
2020 struct __mm_storeh_pd_struct {
2021 double __u;
2022 } __attribute__((__packed__, __may_alias__));
2023 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2024}
2025
2026/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2027/// memory location.
2028///
2029/// \headerfile <x86intrin.h>
2030///
2031/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2032///
2033/// \param __dp
2034/// A pointer to a 64-bit memory location.
2035/// \param __a
2036/// A 128-bit vector of [2 x double] containing the value to be stored.
2037static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2038 __m128d __a) {
2039 struct __mm_storeh_pd_struct {
2040 double __u;
2041 } __attribute__((__packed__, __may_alias__));
2042 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2043}
2044
2045/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2046/// saving the lower 8 bits of each sum in the corresponding element of a
2047/// 128-bit result vector of [16 x i8].
2048///
2049/// The integer elements of both parameters can be either signed or unsigned.
2050///
2051/// \headerfile <x86intrin.h>
2052///
2053/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2054///
2055/// \param __a
2056/// A 128-bit vector of [16 x i8].
2057/// \param __b
2058/// A 128-bit vector of [16 x i8].
2059/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2060/// parameters.
2061static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2062 __m128i __b) {
2063 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2064}
2065
2066/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2067/// saving the lower 16 bits of each sum in the corresponding element of a
2068/// 128-bit result vector of [8 x i16].
2069///
2070/// The integer elements of both parameters can be either signed or unsigned.
2071///
2072/// \headerfile <x86intrin.h>
2073///
2074/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2075///
2076/// \param __a
2077/// A 128-bit vector of [8 x i16].
2078/// \param __b
2079/// A 128-bit vector of [8 x i16].
2080/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2081/// parameters.
2082static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2083 __m128i __b) {
2084 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2085}
2086
2087/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2088/// saving the lower 32 bits of each sum in the corresponding element of a
2089/// 128-bit result vector of [4 x i32].
2090///
2091/// The integer elements of both parameters can be either signed or unsigned.
2092///
2093/// \headerfile <x86intrin.h>
2094///
2095/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2096///
2097/// \param __a
2098/// A 128-bit vector of [4 x i32].
2099/// \param __b
2100/// A 128-bit vector of [4 x i32].
2101/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2102/// parameters.
2103static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2104 __m128i __b) {
2105 return (__m128i)((__v4su)__a + (__v4su)__b);
2106}
2107
2108/// Adds two signed or unsigned 64-bit integer values, returning the
2109/// lower 64 bits of the sum.
2110///
2111/// \headerfile <x86intrin.h>
2112///
2113/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2114///
2115/// \param __a
2116/// A 64-bit integer.
2117/// \param __b
2118/// A 64-bit integer.
2119/// \returns A 64-bit integer containing the sum of both parameters.
2120static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
2121 return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
2122}
2123
2124/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2125/// saving the lower 64 bits of each sum in the corresponding element of a
2126/// 128-bit result vector of [2 x i64].
2127///
2128/// The integer elements of both parameters can be either signed or unsigned.
2129///
2130/// \headerfile <x86intrin.h>
2131///
2132/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2133///
2134/// \param __a
2135/// A 128-bit vector of [2 x i64].
2136/// \param __b
2137/// A 128-bit vector of [2 x i64].
2138/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2139/// parameters.
2140static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2141 __m128i __b) {
2142 return (__m128i)((__v2du)__a + (__v2du)__b);
2143}
2144
2145/// Adds, with saturation, the corresponding elements of two 128-bit
2146/// signed [16 x i8] vectors, saving each sum in the corresponding element
2147/// of a 128-bit result vector of [16 x i8].
2148///
2149/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2150/// less than 0x80 are saturated to 0x80.
2151///
2152/// \headerfile <x86intrin.h>
2153///
2154/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2155///
2156/// \param __a
2157/// A 128-bit signed [16 x i8] vector.
2158/// \param __b
2159/// A 128-bit signed [16 x i8] vector.
2160/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2161/// both parameters.
2162static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2163 __m128i __b) {
2164 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2165}
2166
2167/// Adds, with saturation, the corresponding elements of two 128-bit
2168/// signed [8 x i16] vectors, saving each sum in the corresponding element
2169/// of a 128-bit result vector of [8 x i16].
2170///
2171/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2172/// less than 0x8000 are saturated to 0x8000.
2173///
2174/// \headerfile <x86intrin.h>
2175///
2176/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2177///
2178/// \param __a
2179/// A 128-bit signed [8 x i16] vector.
2180/// \param __b
2181/// A 128-bit signed [8 x i16] vector.
2182/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2183/// both parameters.
2184static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2185 __m128i __b) {
2186 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2187}
2188
2189/// Adds, with saturation, the corresponding elements of two 128-bit
2190/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2191/// of a 128-bit result vector of [16 x i8].
2192///
2193/// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2194/// saturated to 0x00.
2195///
2196/// \headerfile <x86intrin.h>
2197///
2198/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2199///
2200/// \param __a
2201/// A 128-bit unsigned [16 x i8] vector.
2202/// \param __b
2203/// A 128-bit unsigned [16 x i8] vector.
2204/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2205/// of both parameters.
2206static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2207 __m128i __b) {
2208 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2209}
2210
2211/// Adds, with saturation, the corresponding elements of two 128-bit
2212/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2213/// of a 128-bit result vector of [8 x i16].
2214///
2215/// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2216/// are saturated to 0x0000.
2217///
2218/// \headerfile <x86intrin.h>
2219///
2220/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2221///
2222/// \param __a
2223/// A 128-bit unsigned [8 x i16] vector.
2224/// \param __b
2225/// A 128-bit unsigned [8 x i16] vector.
2226/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2227/// of both parameters.
2228static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2229 __m128i __b) {
2230 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2231}
2232
2233/// Computes the rounded averages of corresponding elements of two
2234/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2235/// corresponding element of a 128-bit result vector of [16 x i8].
2236///
2237/// \headerfile <x86intrin.h>
2238///
2239/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2240///
2241/// \param __a
2242/// A 128-bit unsigned [16 x i8] vector.
2243/// \param __b
2244/// A 128-bit unsigned [16 x i8] vector.
2245/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2246/// averages of both parameters.
2247static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2248 __m128i __b) {
2249 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2250}
2251
2252/// Computes the rounded averages of corresponding elements of two
2253/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2254/// corresponding element of a 128-bit result vector of [8 x i16].
2255///
2256/// \headerfile <x86intrin.h>
2257///
2258/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2259///
2260/// \param __a
2261/// A 128-bit unsigned [8 x i16] vector.
2262/// \param __b
2263/// A 128-bit unsigned [8 x i16] vector.
2264/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2265/// averages of both parameters.
2266static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2267 __m128i __b) {
2268 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2269}
2270
2271/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2272/// vectors, producing eight intermediate 32-bit signed integer products, and
2273/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2274/// [4 x i32] vector.
2275///
2276/// For example, bits [15:0] of both parameters are multiplied producing a
2277/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2278/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2279/// of the result.
2280///
2281/// \headerfile <x86intrin.h>
2282///
2283/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2284///
2285/// \param __a
2286/// A 128-bit signed [8 x i16] vector.
2287/// \param __b
2288/// A 128-bit signed [8 x i16] vector.
2289/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2290/// of both parameters.
2291static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2292 __m128i __b) {
2293 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2294}
2295
2296/// Compares corresponding elements of two 128-bit signed [8 x i16]
2297/// vectors, saving the greater value from each comparison in the
2298/// corresponding element of a 128-bit result vector of [8 x i16].
2299///
2300/// \headerfile <x86intrin.h>
2301///
2302/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2303///
2304/// \param __a
2305/// A 128-bit signed [8 x i16] vector.
2306/// \param __b
2307/// A 128-bit signed [8 x i16] vector.
2308/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2309/// each comparison.
2310static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2311 __m128i __b) {
2312 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2313}
2314
2315/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2316/// vectors, saving the greater value from each comparison in the
2317/// corresponding element of a 128-bit result vector of [16 x i8].
2318///
2319/// \headerfile <x86intrin.h>
2320///
2321/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2322///
2323/// \param __a
2324/// A 128-bit unsigned [16 x i8] vector.
2325/// \param __b
2326/// A 128-bit unsigned [16 x i8] vector.
2327/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2328/// each comparison.
2329static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2330 __m128i __b) {
2331 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2332}
2333
2334/// Compares corresponding elements of two 128-bit signed [8 x i16]
2335/// vectors, saving the smaller value from each comparison in the
2336/// corresponding element of a 128-bit result vector of [8 x i16].
2337///
2338/// \headerfile <x86intrin.h>
2339///
2340/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2341///
2342/// \param __a
2343/// A 128-bit signed [8 x i16] vector.
2344/// \param __b
2345/// A 128-bit signed [8 x i16] vector.
2346/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2347/// each comparison.
2348static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2349 __m128i __b) {
2350 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2351}
2352
2353/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2354/// vectors, saving the smaller value from each comparison in the
2355/// corresponding element of a 128-bit result vector of [16 x i8].
2356///
2357/// \headerfile <x86intrin.h>
2358///
2359/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2360///
2361/// \param __a
2362/// A 128-bit unsigned [16 x i8] vector.
2363/// \param __b
2364/// A 128-bit unsigned [16 x i8] vector.
2365/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2366/// each comparison.
2367static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2368 __m128i __b) {
2369 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2370}
2371
2372/// Multiplies the corresponding elements of two signed [8 x i16]
2373/// vectors, saving the upper 16 bits of each 32-bit product in the
2374/// corresponding element of a 128-bit signed [8 x i16] result vector.
2375///
2376/// \headerfile <x86intrin.h>
2377///
2378/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2379///
2380/// \param __a
2381/// A 128-bit signed [8 x i16] vector.
2382/// \param __b
2383/// A 128-bit signed [8 x i16] vector.
2384/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2385/// each of the eight 32-bit products.
2386static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2387 __m128i __b) {
2388 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2389}
2390
2391/// Multiplies the corresponding elements of two unsigned [8 x i16]
2392/// vectors, saving the upper 16 bits of each 32-bit product in the
2393/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2394///
2395/// \headerfile <x86intrin.h>
2396///
2397/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2398///
2399/// \param __a
2400/// A 128-bit unsigned [8 x i16] vector.
2401/// \param __b
2402/// A 128-bit unsigned [8 x i16] vector.
2403/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2404/// of each of the eight 32-bit products.
2405static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2406 __m128i __b) {
2407 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2408}
2409
2410/// Multiplies the corresponding elements of two signed [8 x i16]
2411/// vectors, saving the lower 16 bits of each 32-bit product in the
2412/// corresponding element of a 128-bit signed [8 x i16] result vector.
2413///
2414/// \headerfile <x86intrin.h>
2415///
2416/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2417///
2418/// \param __a
2419/// A 128-bit signed [8 x i16] vector.
2420/// \param __b
2421/// A 128-bit signed [8 x i16] vector.
2422/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2423/// each of the eight 32-bit products.
2424static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2425 __m128i __b) {
2426 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2427}
2428
2429/// Multiplies 32-bit unsigned integer values contained in the lower bits
2430/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2431/// product.
2432///
2433/// \headerfile <x86intrin.h>
2434///
2435/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2436///
2437/// \param __a
2438/// A 64-bit integer containing one of the source operands.
2439/// \param __b
2440/// A 64-bit integer containing one of the source operands.
2441/// \returns A 64-bit integer vector containing the product of both operands.
2442static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
2443 return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
2444 (__v4si)__anyext128(__b)));
2445}
2446
2447/// Multiplies 32-bit unsigned integer values contained in the lower
2448/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2449/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2450///
2451/// \headerfile <x86intrin.h>
2452///
2453/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2454///
2455/// \param __a
2456/// A [2 x i64] vector containing one of the source operands.
2457/// \param __b
2458/// A [2 x i64] vector containing one of the source operands.
2459/// \returns A [2 x i64] vector containing the product of both operands.
2460static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2461 __m128i __b) {
2462 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2463}
2464
2465/// Computes the absolute differences of corresponding 8-bit integer
2466/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2467/// separately sums the second 8 absolute differences. Packs these two
2468/// unsigned 16-bit integer sums into the upper and lower elements of a
2469/// [2 x i64] vector.
2470///
2471/// \headerfile <x86intrin.h>
2472///
2473/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2474///
2475/// \param __a
2476/// A 128-bit integer vector containing one of the source operands.
2477/// \param __b
2478/// A 128-bit integer vector containing one of the source operands.
2479/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2480/// differences between both operands.
2481static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2482 __m128i __b) {
2483 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2484}
2485
2486/// Subtracts the corresponding 8-bit integer values in the operands.
2487///
2488/// \headerfile <x86intrin.h>
2489///
2490/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2491///
2492/// \param __a
2493/// A 128-bit integer vector containing the minuends.
2494/// \param __b
2495/// A 128-bit integer vector containing the subtrahends.
2496/// \returns A 128-bit integer vector containing the differences of the values
2497/// in the operands.
2498static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2499 __m128i __b) {
2500 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2501}
2502
2503/// Subtracts the corresponding 16-bit integer values in the operands.
2504///
2505/// \headerfile <x86intrin.h>
2506///
2507/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2508///
2509/// \param __a
2510/// A 128-bit integer vector containing the minuends.
2511/// \param __b
2512/// A 128-bit integer vector containing the subtrahends.
2513/// \returns A 128-bit integer vector containing the differences of the values
2514/// in the operands.
2515static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2516 __m128i __b) {
2517 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2518}
2519
2520/// Subtracts the corresponding 32-bit integer values in the operands.
2521///
2522/// \headerfile <x86intrin.h>
2523///
2524/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2525///
2526/// \param __a
2527/// A 128-bit integer vector containing the minuends.
2528/// \param __b
2529/// A 128-bit integer vector containing the subtrahends.
2530/// \returns A 128-bit integer vector containing the differences of the values
2531/// in the operands.
2532static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2533 __m128i __b) {
2534 return (__m128i)((__v4su)__a - (__v4su)__b);
2535}
2536
2537/// Subtracts signed or unsigned 64-bit integer values and writes the
2538/// difference to the corresponding bits in the destination.
2539///
2540/// \headerfile <x86intrin.h>
2541///
2542/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2543///
2544/// \param __a
2545/// A 64-bit integer vector containing the minuend.
2546/// \param __b
2547/// A 64-bit integer vector containing the subtrahend.
2548/// \returns A 64-bit integer vector containing the difference of the values in
2549/// the operands.
2550static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
2551 return (__m64)((unsigned long long)__a - (unsigned long long)__b);
2552}
2553
2554/// Subtracts the corresponding elements of two [2 x i64] vectors.
2555///
2556/// \headerfile <x86intrin.h>
2557///
2558/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2559///
2560/// \param __a
2561/// A 128-bit integer vector containing the minuends.
2562/// \param __b
2563/// A 128-bit integer vector containing the subtrahends.
2564/// \returns A 128-bit integer vector containing the differences of the values
2565/// in the operands.
2566static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2567 __m128i __b) {
2568 return (__m128i)((__v2du)__a - (__v2du)__b);
2569}
2570
2571/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2572/// the input and returns the differences in the corresponding bytes in the
2573/// destination.
2574///
2575/// Differences greater than 0x7F are saturated to 0x7F, and differences
2576/// less than 0x80 are saturated to 0x80.
2577///
2578/// \headerfile <x86intrin.h>
2579///
2580/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2581///
2582/// \param __a
2583/// A 128-bit integer vector containing the minuends.
2584/// \param __b
2585/// A 128-bit integer vector containing the subtrahends.
2586/// \returns A 128-bit integer vector containing the differences of the values
2587/// in the operands.
2588static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2589 __m128i __b) {
2590 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2591}
2592
2593/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2594/// the input and returns the differences in the corresponding bytes in the
2595/// destination.
2596///
2597/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2598/// than 0x8000 are saturated to 0x8000.
2599///
2600/// \headerfile <x86intrin.h>
2601///
2602/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2603///
2604/// \param __a
2605/// A 128-bit integer vector containing the minuends.
2606/// \param __b
2607/// A 128-bit integer vector containing the subtrahends.
2608/// \returns A 128-bit integer vector containing the differences of the values
2609/// in the operands.
2610static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2611 __m128i __b) {
2612 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2613}
2614
2615/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2616/// the input and returns the differences in the corresponding bytes in the
2617/// destination.
2618///
2619/// Differences less than 0x00 are saturated to 0x00.
2620///
2621/// \headerfile <x86intrin.h>
2622///
2623/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2624///
2625/// \param __a
2626/// A 128-bit integer vector containing the minuends.
2627/// \param __b
2628/// A 128-bit integer vector containing the subtrahends.
2629/// \returns A 128-bit integer vector containing the unsigned integer
2630/// differences of the values in the operands.
2631static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2632 __m128i __b) {
2633 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2634}
2635
2636/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2637/// the input and returns the differences in the corresponding bytes in the
2638/// destination.
2639///
2640/// Differences less than 0x0000 are saturated to 0x0000.
2641///
2642/// \headerfile <x86intrin.h>
2643///
2644/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2645///
2646/// \param __a
2647/// A 128-bit integer vector containing the minuends.
2648/// \param __b
2649/// A 128-bit integer vector containing the subtrahends.
2650/// \returns A 128-bit integer vector containing the unsigned integer
2651/// differences of the values in the operands.
2652static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2653 __m128i __b) {
2654 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2655}
2656
2657/// Performs a bitwise AND of two 128-bit integer vectors.
2658///
2659/// \headerfile <x86intrin.h>
2660///
2661/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2662///
2663/// \param __a
2664/// A 128-bit integer vector containing one of the source operands.
2665/// \param __b
2666/// A 128-bit integer vector containing one of the source operands.
2667/// \returns A 128-bit integer vector containing the bitwise AND of the values
2668/// in both operands.
2669static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2670 __m128i __b) {
2671 return (__m128i)((__v2du)__a & (__v2du)__b);
2672}
2673
2674/// Performs a bitwise AND of two 128-bit integer vectors, using the
2675/// one's complement of the values contained in the first source operand.
2676///
2677/// \headerfile <x86intrin.h>
2678///
2679/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2680///
2681/// \param __a
2682/// A 128-bit vector containing the left source operand. The one's complement
2683/// of this value is used in the bitwise AND.
2684/// \param __b
2685/// A 128-bit vector containing the right source operand.
2686/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2687/// complement of the first operand and the values in the second operand.
2688static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2689 __m128i __b) {
2690 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2691}
2692/// Performs a bitwise OR of two 128-bit integer vectors.
2693///
2694/// \headerfile <x86intrin.h>
2695///
2696/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2697///
2698/// \param __a
2699/// A 128-bit integer vector containing one of the source operands.
2700/// \param __b
2701/// A 128-bit integer vector containing one of the source operands.
2702/// \returns A 128-bit integer vector containing the bitwise OR of the values
2703/// in both operands.
2704static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2705 __m128i __b) {
2706 return (__m128i)((__v2du)__a | (__v2du)__b);
2707}
2708
2709/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2710///
2711/// \headerfile <x86intrin.h>
2712///
2713/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2714///
2715/// \param __a
2716/// A 128-bit integer vector containing one of the source operands.
2717/// \param __b
2718/// A 128-bit integer vector containing one of the source operands.
2719/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2720/// values in both operands.
2721static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2722 __m128i __b) {
2723 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2724}
2725
/// Shifts the whole 128-bit integer vector operand left by the given number
///    of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted left.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))

/// Alternate spelling of _mm_slli_si128: expands to the same byte-wise left
///    shift of operand \a a by \a imm bytes.
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
2750
2751/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2752/// by the specified number of bits. Low-order bits are cleared.
2753///
2754/// \headerfile <x86intrin.h>
2755///
2756/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2757///
2758/// \param __a
2759/// A 128-bit integer vector containing the source operand.
2760/// \param __count
2761/// An integer value specifying the number of bits to left-shift each value
2762/// in operand \a __a.
2763/// \returns A 128-bit integer vector containing the left-shifted values.
2764static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2765 int __count) {
2766 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2767}
2768
2769/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2770/// by the specified number of bits. Low-order bits are cleared.
2771///
2772/// \headerfile <x86intrin.h>
2773///
2774/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2775///
2776/// \param __a
2777/// A 128-bit integer vector containing the source operand.
2778/// \param __count
2779/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2780/// to left-shift each value in operand \a __a.
2781/// \returns A 128-bit integer vector containing the left-shifted values.
2782static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2783 __m128i __count) {
2784 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2785}
2786
2787/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2788/// by the specified number of bits. Low-order bits are cleared.
2789///
2790/// \headerfile <x86intrin.h>
2791///
2792/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2793///
2794/// \param __a
2795/// A 128-bit integer vector containing the source operand.
2796/// \param __count
2797/// An integer value specifying the number of bits to left-shift each value
2798/// in operand \a __a.
2799/// \returns A 128-bit integer vector containing the left-shifted values.
2800static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2801 int __count) {
2802 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2803}
2804
2805/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2806/// by the specified number of bits. Low-order bits are cleared.
2807///
2808/// \headerfile <x86intrin.h>
2809///
2810/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2811///
2812/// \param __a
2813/// A 128-bit integer vector containing the source operand.
2814/// \param __count
2815/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2816/// to left-shift each value in operand \a __a.
2817/// \returns A 128-bit integer vector containing the left-shifted values.
2818static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2819 __m128i __count) {
2820 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2821}
2822
2823/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2824/// by the specified number of bits. Low-order bits are cleared.
2825///
2826/// \headerfile <x86intrin.h>
2827///
2828/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2829///
2830/// \param __a
2831/// A 128-bit integer vector containing the source operand.
2832/// \param __count
2833/// An integer value specifying the number of bits to left-shift each value
2834/// in operand \a __a.
2835/// \returns A 128-bit integer vector containing the left-shifted values.
2836static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2837 int __count) {
2838 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2839}
2840
2841/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2842/// by the specified number of bits. Low-order bits are cleared.
2843///
2844/// \headerfile <x86intrin.h>
2845///
2846/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2847///
2848/// \param __a
2849/// A 128-bit integer vector containing the source operand.
2850/// \param __count
2851/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2852/// to left-shift each value in operand \a __a.
2853/// \returns A 128-bit integer vector containing the left-shifted values.
2854static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2855 __m128i __count) {
2856 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2857}
2858
2859/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2860/// by the specified number of bits. High-order bits are filled with the sign
2861/// bit of the initial value.
2862///
2863/// \headerfile <x86intrin.h>
2864///
2865/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2866///
2867/// \param __a
2868/// A 128-bit integer vector containing the source operand.
2869/// \param __count
2870/// An integer value specifying the number of bits to right-shift each value
2871/// in operand \a __a.
2872/// \returns A 128-bit integer vector containing the right-shifted values.
2873static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2874 int __count) {
2875 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2876}
2877
2878/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2879/// by the specified number of bits. High-order bits are filled with the sign
2880/// bit of the initial value.
2881///
2882/// \headerfile <x86intrin.h>
2883///
2884/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2885///
2886/// \param __a
2887/// A 128-bit integer vector containing the source operand.
2888/// \param __count
2889/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2890/// to right-shift each value in operand \a __a.
2891/// \returns A 128-bit integer vector containing the right-shifted values.
2892static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2893 __m128i __count) {
2894 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2895}
2896
2897/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2898/// by the specified number of bits. High-order bits are filled with the sign
2899/// bit of the initial value.
2900///
2901/// \headerfile <x86intrin.h>
2902///
2903/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2904///
2905/// \param __a
2906/// A 128-bit integer vector containing the source operand.
2907/// \param __count
2908/// An integer value specifying the number of bits to right-shift each value
2909/// in operand \a __a.
2910/// \returns A 128-bit integer vector containing the right-shifted values.
2911static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2912 int __count) {
2913 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2914}
2915
2916/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2917/// by the specified number of bits. High-order bits are filled with the sign
2918/// bit of the initial value.
2919///
2920/// \headerfile <x86intrin.h>
2921///
2922/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2923///
2924/// \param __a
2925/// A 128-bit integer vector containing the source operand.
2926/// \param __count
2927/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2928/// to right-shift each value in operand \a __a.
2929/// \returns A 128-bit integer vector containing the right-shifted values.
2930static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2931 __m128i __count) {
2932 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2933}
2934
/// Shifts the whole 128-bit integer vector operand right by the given number
///    of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted right.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))

/// Alternate spelling of _mm_srli_si128: expands to the same byte-wise right
///    shift of operand \a a by \a imm bytes.
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
2959
2960/// Right-shifts each of 16-bit values in the 128-bit integer vector
2961/// operand by the specified number of bits. High-order bits are cleared.
2962///
2963/// \headerfile <x86intrin.h>
2964///
2965/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2966///
2967/// \param __a
2968/// A 128-bit integer vector containing the source operand.
2969/// \param __count
2970/// An integer value specifying the number of bits to right-shift each value
2971/// in operand \a __a.
2972/// \returns A 128-bit integer vector containing the right-shifted values.
2973static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2974 int __count) {
2975 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2976}
2977
2978/// Right-shifts each of 16-bit values in the 128-bit integer vector
2979/// operand by the specified number of bits. High-order bits are cleared.
2980///
2981/// \headerfile <x86intrin.h>
2982///
2983/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2984///
2985/// \param __a
2986/// A 128-bit integer vector containing the source operand.
2987/// \param __count
2988/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2989/// to right-shift each value in operand \a __a.
2990/// \returns A 128-bit integer vector containing the right-shifted values.
2991static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2992 __m128i __count) {
2993 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2994}
2995
2996/// Right-shifts each of 32-bit values in the 128-bit integer vector
2997/// operand by the specified number of bits. High-order bits are cleared.
2998///
2999/// \headerfile <x86intrin.h>
3000///
3001/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3002///
3003/// \param __a
3004/// A 128-bit integer vector containing the source operand.
3005/// \param __count
3006/// An integer value specifying the number of bits to right-shift each value
3007/// in operand \a __a.
3008/// \returns A 128-bit integer vector containing the right-shifted values.
3009static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3010 int __count) {
3011 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3012}
3013
3014/// Right-shifts each of 32-bit values in the 128-bit integer vector
3015/// operand by the specified number of bits. High-order bits are cleared.
3016///
3017/// \headerfile <x86intrin.h>
3018///
3019/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3020///
3021/// \param __a
3022/// A 128-bit integer vector containing the source operand.
3023/// \param __count
3024/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3025/// to right-shift each value in operand \a __a.
3026/// \returns A 128-bit integer vector containing the right-shifted values.
3027static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3028 __m128i __count) {
3029 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3030}
3031
3032/// Right-shifts each of 64-bit values in the 128-bit integer vector
3033/// operand by the specified number of bits. High-order bits are cleared.
3034///
3035/// \headerfile <x86intrin.h>
3036///
3037/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3038///
3039/// \param __a
3040/// A 128-bit integer vector containing the source operand.
3041/// \param __count
3042/// An integer value specifying the number of bits to right-shift each value
3043/// in operand \a __a.
3044/// \returns A 128-bit integer vector containing the right-shifted values.
3045static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3046 int __count) {
3047 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3048}
3049
3050/// Right-shifts each of 64-bit values in the 128-bit integer vector
3051/// operand by the specified number of bits. High-order bits are cleared.
3052///
3053/// \headerfile <x86intrin.h>
3054///
3055/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3056///
3057/// \param __a
3058/// A 128-bit integer vector containing the source operand.
3059/// \param __count
3060/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3061/// to right-shift each value in operand \a __a.
3062/// \returns A 128-bit integer vector containing the right-shifted values.
3063static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3064 __m128i __count) {
3065 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3066}
3067
3068/// Compares each of the corresponding 8-bit values of the 128-bit
3069/// integer vectors for equality.
3070///
3071/// Each comparison returns 0x0 for false, 0xFF for true.
3072///
3073/// \headerfile <x86intrin.h>
3074///
3075/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3076///
3077/// \param __a
3078/// A 128-bit integer vector.
3079/// \param __b
3080/// A 128-bit integer vector.
3081/// \returns A 128-bit integer vector containing the comparison results.
3082static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3083 __m128i __b) {
3084 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3085}
3086
3087/// Compares each of the corresponding 16-bit values of the 128-bit
3088/// integer vectors for equality.
3089///
3090/// Each comparison returns 0x0 for false, 0xFFFF for true.
3091///
3092/// \headerfile <x86intrin.h>
3093///
3094/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3095///
3096/// \param __a
3097/// A 128-bit integer vector.
3098/// \param __b
3099/// A 128-bit integer vector.
3100/// \returns A 128-bit integer vector containing the comparison results.
3101static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3102 __m128i __b) {
3103 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3104}
3105
3106/// Compares each of the corresponding 32-bit values of the 128-bit
3107/// integer vectors for equality.
3108///
3109/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3110///
3111/// \headerfile <x86intrin.h>
3112///
3113/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3114///
3115/// \param __a
3116/// A 128-bit integer vector.
3117/// \param __b
3118/// A 128-bit integer vector.
3119/// \returns A 128-bit integer vector containing the comparison results.
3120static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3121 __m128i __b) {
3122 return (__m128i)((__v4si)__a == (__v4si)__b);
3123}
3124
3125/// Compares each of the corresponding signed 8-bit values of the 128-bit
3126/// integer vectors to determine if the values in the first operand are
3127/// greater than those in the second operand.
3128///
3129/// Each comparison returns 0x0 for false, 0xFF for true.
3130///
3131/// \headerfile <x86intrin.h>
3132///
3133/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3134///
3135/// \param __a
3136/// A 128-bit integer vector.
3137/// \param __b
3138/// A 128-bit integer vector.
3139/// \returns A 128-bit integer vector containing the comparison results.
3140static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3141 __m128i __b) {
3142 /* This function always performs a signed comparison, but __v16qi is a char
3143 which may be signed or unsigned, so use __v16qs. */
3144 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3145}
3146
3147/// Compares each of the corresponding signed 16-bit values of the
3148/// 128-bit integer vectors to determine if the values in the first operand
3149/// are greater than those in the second operand.
3150///
3151/// Each comparison returns 0x0 for false, 0xFFFF for true.
3152///
3153/// \headerfile <x86intrin.h>
3154///
3155/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3156///
3157/// \param __a
3158/// A 128-bit integer vector.
3159/// \param __b
3160/// A 128-bit integer vector.
3161/// \returns A 128-bit integer vector containing the comparison results.
3162static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3163 __m128i __b) {
3164 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3165}
3166
3167/// Compares each of the corresponding signed 32-bit values of the
3168/// 128-bit integer vectors to determine if the values in the first operand
3169/// are greater than those in the second operand.
3170///
3171/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3172///
3173/// \headerfile <x86intrin.h>
3174///
3175/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3176///
3177/// \param __a
3178/// A 128-bit integer vector.
3179/// \param __b
3180/// A 128-bit integer vector.
3181/// \returns A 128-bit integer vector containing the comparison results.
3182static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3183 __m128i __b) {
3184 return (__m128i)((__v4si)__a > (__v4si)__b);
3185}
3186
3187/// Compares each of the corresponding signed 8-bit values of the 128-bit
3188/// integer vectors to determine if the values in the first operand are less
3189/// than those in the second operand.
3190///
3191/// Each comparison returns 0x0 for false, 0xFF for true.
3192///
3193/// \headerfile <x86intrin.h>
3194///
3195/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3196///
3197/// \param __a
3198/// A 128-bit integer vector.
3199/// \param __b
3200/// A 128-bit integer vector.
3201/// \returns A 128-bit integer vector containing the comparison results.
3202static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3203 __m128i __b) {
3204 return _mm_cmpgt_epi8(__b, __a);
3205}
3206
3207/// Compares each of the corresponding signed 16-bit values of the
3208/// 128-bit integer vectors to determine if the values in the first operand
3209/// are less than those in the second operand.
3210///
3211/// Each comparison returns 0x0 for false, 0xFFFF for true.
3212///
3213/// \headerfile <x86intrin.h>
3214///
3215/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3216///
3217/// \param __a
3218/// A 128-bit integer vector.
3219/// \param __b
3220/// A 128-bit integer vector.
3221/// \returns A 128-bit integer vector containing the comparison results.
3222static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3223 __m128i __b) {
3224 return _mm_cmpgt_epi16(__b, __a);
3225}
3226
3227/// Compares each of the corresponding signed 32-bit values of the
3228/// 128-bit integer vectors to determine if the values in the first operand
3229/// are less than those in the second operand.
3230///
3231/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3232///
3233/// \headerfile <x86intrin.h>
3234///
3235/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3236///
3237/// \param __a
3238/// A 128-bit integer vector.
3239/// \param __b
3240/// A 128-bit integer vector.
3241/// \returns A 128-bit integer vector containing the comparison results.
3242static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3243 __m128i __b) {
3244 return _mm_cmpgt_epi32(__b, __a);
3245}
3246
3247#ifdef __x86_64__
3248/// Converts a 64-bit signed integer value from the second operand into a
3249/// double-precision value and returns it in the lower element of a [2 x
3250/// double] vector; the upper element of the returned vector is copied from
3251/// the upper element of the first operand.
3252///
3253/// \headerfile <x86intrin.h>
3254///
3255/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3256///
3257/// \param __a
3258/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3259/// copied to the upper 64 bits of the destination.
3260/// \param __b
3261/// A 64-bit signed integer operand containing the value to be converted.
3262/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3263/// converted value of the second operand. The upper 64 bits are copied from
3264/// the upper 64 bits of the first operand.
3265static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3266 long long __b) {
3267 __a[0] = __b;
3268 return __a;
3269}
3270
3271/// Converts the first (lower) element of a vector of [2 x double] into a
3272/// 64-bit signed integer value.
3273///
3274/// If the converted value does not fit in a 64-bit integer, raises a
3275/// floating-point invalid exception. If the exception is masked, returns
3276/// the most negative integer.
3277///
3278/// \headerfile <x86intrin.h>
3279///
3280/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3281///
3282/// \param __a
3283/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3284/// conversion.
3285/// \returns A 64-bit signed integer containing the converted value.
3286static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3287 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3288}
3289
3290/// Converts the first (lower) element of a vector of [2 x double] into a
3291/// 64-bit signed truncated (rounded toward zero) integer value.
3292///
3293/// If a converted value does not fit in a 64-bit integer, raises a
3294/// floating-point invalid exception. If the exception is masked, returns
3295/// the most negative integer.
3296///
3297/// \headerfile <x86intrin.h>
3298///
3299/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3300/// instruction.
3301///
3302/// \param __a
3303/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3304/// conversion.
3305/// \returns A 64-bit signed integer containing the converted value.
3306static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3307 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3308}
3309#endif
3310
3311/// Converts a vector of [4 x i32] into a vector of [4 x float].
3312///
3313/// \headerfile <x86intrin.h>
3314///
3315/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3316///
3317/// \param __a
3318/// A 128-bit integer vector.
3319/// \returns A 128-bit vector of [4 x float] containing the converted values.
3320static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3321 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3322}
3323
3324/// Converts a vector of [4 x float] into a vector of [4 x i32].
3325///
3326/// If a converted value does not fit in a 32-bit integer, raises a
3327/// floating-point invalid exception. If the exception is masked, returns
3328/// the most negative integer.
3329///
3330/// \headerfile <x86intrin.h>
3331///
3332/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3333///
3334/// \param __a
3335/// A 128-bit vector of [4 x float].
3336/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3337/// values.
3338static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3339 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3340}
3341
3342/// Converts a vector of [4 x float] into four signed truncated (rounded toward
3343/// zero) 32-bit integers, returned in a vector of [4 x i32].
3344///
3345/// If a converted value does not fit in a 32-bit integer, raises a
3346/// floating-point invalid exception. If the exception is masked, returns
3347/// the most negative integer.
3348///
3349/// \headerfile <x86intrin.h>
3350///
3351/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3352/// instruction.
3353///
3354/// \param __a
3355/// A 128-bit vector of [4 x float].
3356/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3357static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3358 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3359}
3360
3361/// Returns a vector of [4 x i32] where the lowest element is the input
3362/// operand and the remaining elements are zero.
3363///
3364/// \headerfile <x86intrin.h>
3365///
3366/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3367///
3368/// \param __a
3369/// A 32-bit signed integer operand.
3370/// \returns A 128-bit vector of [4 x i32].
3371static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3372 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3373}
3374
3375/// Returns a vector of [2 x i64] where the lower element is the input
3376/// operand and the upper element is zero.
3377///
3378/// \headerfile <x86intrin.h>
3379///
3380/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3381/// in 64-bit mode.
3382///
3383/// \param __a
3384/// A 64-bit signed integer operand containing the value to be converted.
3385/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3386static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3387 return __extension__(__m128i)(__v2di){__a, 0};
3388}
3389
3390/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3391/// 32-bit signed integer value.
3392///
3393/// \headerfile <x86intrin.h>
3394///
3395/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3396///
3397/// \param __a
3398/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3399/// destination.
3400/// \returns A 32-bit signed integer containing the moved value.
3401static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3402 __v4si __b = (__v4si)__a;
3403 return __b[0];
3404}
3405
3406/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3407/// 64-bit signed integer value.
3408///
3409/// \headerfile <x86intrin.h>
3410///
3411/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3412///
3413/// \param __a
3414/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3415/// destination.
3416/// \returns A 64-bit signed integer containing the moved value.
3417static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3418 return __a[0];
3419}
3420
3421/// Moves packed integer values from an aligned 128-bit memory location
3422/// to elements in a 128-bit integer vector.
3423///
3424/// \headerfile <x86intrin.h>
3425///
3426/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3427///
3428/// \param __p
3429/// An aligned pointer to a memory location containing integer values.
3430/// \returns A 128-bit integer vector containing the moved values.
3431static __inline__ __m128i __DEFAULT_FN_ATTRS
3432_mm_load_si128(__m128i const *__p) {
3433 return *__p;
3434}
3435
3436/// Moves packed integer values from an unaligned 128-bit memory location
3437/// to elements in a 128-bit integer vector.
3438///
3439/// \headerfile <x86intrin.h>
3440///
3441/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3442///
3443/// \param __p
3444/// A pointer to a memory location containing integer values.
3445/// \returns A 128-bit integer vector containing the moved values.
3446static __inline__ __m128i __DEFAULT_FN_ATTRS
3447_mm_loadu_si128(__m128i_u const *__p) {
3448 struct __loadu_si128 {
3449 __m128i_u __v;
3450 } __attribute__((__packed__, __may_alias__));
3451 return ((const struct __loadu_si128 *)__p)->__v;
3452}
3453
3454/// Returns a vector of [2 x i64] where the lower element is taken from
3455/// the lower element of the operand, and the upper element is zero.
3456///
3457/// \headerfile <x86intrin.h>
3458///
3459/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3460///
3461/// \param __p
3462/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3463/// the destination.
3464/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3465/// moved value. The higher order bits are cleared.
3466static __inline__ __m128i __DEFAULT_FN_ATTRS
3467_mm_loadl_epi64(__m128i_u const *__p) {
3468 struct __mm_loadl_epi64_struct {
3469 long long __u;
3470 } __attribute__((__packed__, __may_alias__));
3471 return __extension__(__m128i){
3472 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3473}
3474
3475/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3476/// This could be used as an argument to another intrinsic function where the
3477/// argument is required but the value is not actually used.
3478///
3479/// \headerfile <x86intrin.h>
3480///
3481/// This intrinsic has no corresponding instruction.
3482///
3483/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3484static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3485 return (__m128i)__builtin_ia32_undef128();
3486}
3487
3488/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3489/// the specified 64-bit integer values.
3490///
3491/// \headerfile <x86intrin.h>
3492///
3493/// This intrinsic is a utility function and does not correspond to a specific
3494/// instruction.
3495///
3496/// \param __q1
3497/// A 64-bit integer value used to initialize the upper 64 bits of the
3498/// destination vector of [2 x i64].
3499/// \param __q0
3500/// A 64-bit integer value used to initialize the lower 64 bits of the
3501/// destination vector of [2 x i64].
3502/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3503/// provided in the operands.
3504static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3505 long long __q0) {
3506 return __extension__(__m128i)(__v2di){__q0, __q1};
3507}
3508
3509/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3510/// the specified 64-bit integer values.
3511///
3512/// \headerfile <x86intrin.h>
3513///
3514/// This intrinsic is a utility function and does not correspond to a specific
3515/// instruction.
3516///
3517/// \param __q1
3518/// A 64-bit integer value used to initialize the upper 64 bits of the
3519/// destination vector of [2 x i64].
3520/// \param __q0
3521/// A 64-bit integer value used to initialize the lower 64 bits of the
3522/// destination vector of [2 x i64].
3523/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3524/// provided in the operands.
3525static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3526 __m64 __q0) {
3527 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3528}
3529
3530/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3531/// the specified 32-bit integer values.
3532///
3533/// \headerfile <x86intrin.h>
3534///
3535/// This intrinsic is a utility function and does not correspond to a specific
3536/// instruction.
3537///
3538/// \param __i3
3539/// A 32-bit integer value used to initialize bits [127:96] of the
3540/// destination vector.
3541/// \param __i2
3542/// A 32-bit integer value used to initialize bits [95:64] of the destination
3543/// vector.
3544/// \param __i1
3545/// A 32-bit integer value used to initialize bits [63:32] of the destination
3546/// vector.
3547/// \param __i0
3548/// A 32-bit integer value used to initialize bits [31:0] of the destination
3549/// vector.
3550/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3551/// provided in the operands.
3552static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3553 int __i1, int __i0) {
3554 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3555}
3556
3557/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3558/// the specified 16-bit integer values.
3559///
3560/// \headerfile <x86intrin.h>
3561///
3562/// This intrinsic is a utility function and does not correspond to a specific
3563/// instruction.
3564///
3565/// \param __w7
3566/// A 16-bit integer value used to initialize bits [127:112] of the
3567/// destination vector.
3568/// \param __w6
3569/// A 16-bit integer value used to initialize bits [111:96] of the
3570/// destination vector.
3571/// \param __w5
3572/// A 16-bit integer value used to initialize bits [95:80] of the destination
3573/// vector.
3574/// \param __w4
3575/// A 16-bit integer value used to initialize bits [79:64] of the destination
3576/// vector.
3577/// \param __w3
3578/// A 16-bit integer value used to initialize bits [63:48] of the destination
3579/// vector.
3580/// \param __w2
3581/// A 16-bit integer value used to initialize bits [47:32] of the destination
3582/// vector.
3583/// \param __w1
3584/// A 16-bit integer value used to initialize bits [31:16] of the destination
3585/// vector.
3586/// \param __w0
3587/// A 16-bit integer value used to initialize bits [15:0] of the destination
3588/// vector.
3589/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3590/// provided in the operands.
3591static __inline__ __m128i __DEFAULT_FN_ATTRS
3592_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3593 short __w2, short __w1, short __w0) {
3594 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3595 __w4, __w5, __w6, __w7};
3596}
3597
3598/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3599/// the specified 8-bit integer values.
3600///
3601/// \headerfile <x86intrin.h>
3602///
3603/// This intrinsic is a utility function and does not correspond to a specific
3604/// instruction.
3605///
3606/// \param __b15
3607/// Initializes bits [127:120] of the destination vector.
3608/// \param __b14
3609/// Initializes bits [119:112] of the destination vector.
3610/// \param __b13
3611/// Initializes bits [111:104] of the destination vector.
3612/// \param __b12
3613/// Initializes bits [103:96] of the destination vector.
3614/// \param __b11
3615/// Initializes bits [95:88] of the destination vector.
3616/// \param __b10
3617/// Initializes bits [87:80] of the destination vector.
3618/// \param __b9
3619/// Initializes bits [79:72] of the destination vector.
3620/// \param __b8
3621/// Initializes bits [71:64] of the destination vector.
3622/// \param __b7
3623/// Initializes bits [63:56] of the destination vector.
3624/// \param __b6
3625/// Initializes bits [55:48] of the destination vector.
3626/// \param __b5
3627/// Initializes bits [47:40] of the destination vector.
3628/// \param __b4
3629/// Initializes bits [39:32] of the destination vector.
3630/// \param __b3
3631/// Initializes bits [31:24] of the destination vector.
3632/// \param __b2
3633/// Initializes bits [23:16] of the destination vector.
3634/// \param __b1
3635/// Initializes bits [15:8] of the destination vector.
3636/// \param __b0
3637/// Initializes bits [7:0] of the destination vector.
3638/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3639/// provided in the operands.
3640static __inline__ __m128i __DEFAULT_FN_ATTRS
3641_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3642 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3643 char __b4, char __b3, char __b2, char __b1, char __b0) {
3644 return __extension__(__m128i)(__v16qi){
3645 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3646 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3647}
3648
3649/// Initializes both values in a 128-bit integer vector with the
3650/// specified 64-bit integer value.
3651///
3652/// \headerfile <x86intrin.h>
3653///
3654/// This intrinsic is a utility function and does not correspond to a specific
3655/// instruction.
3656///
3657/// \param __q
3658/// Integer value used to initialize the elements of the destination integer
3659/// vector.
3660/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3661/// elements containing the value provided in the operand.
3662static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3663 return _mm_set_epi64x(__q, __q);
3664}
3665
3666/// Initializes both values in a 128-bit vector of [2 x i64] with the
3667/// specified 64-bit value.
3668///
3669/// \headerfile <x86intrin.h>
3670///
3671/// This intrinsic is a utility function and does not correspond to a specific
3672/// instruction.
3673///
3674/// \param __q
3675/// A 64-bit value used to initialize the elements of the destination integer
3676/// vector.
3677/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3678/// containing the value provided in the operand.
3679static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3680 return _mm_set_epi64(__q, __q);
3681}
3682
3683/// Initializes all values in a 128-bit vector of [4 x i32] with the
3684/// specified 32-bit value.
3685///
3686/// \headerfile <x86intrin.h>
3687///
3688/// This intrinsic is a utility function and does not correspond to a specific
3689/// instruction.
3690///
3691/// \param __i
3692/// A 32-bit value used to initialize the elements of the destination integer
3693/// vector.
3694/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3695/// containing the value provided in the operand.
3696static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3697 return _mm_set_epi32(__i, __i, __i, __i);
3698}
3699
3700/// Initializes all values in a 128-bit vector of [8 x i16] with the
3701/// specified 16-bit value.
3702///
3703/// \headerfile <x86intrin.h>
3704///
3705/// This intrinsic is a utility function and does not correspond to a specific
3706/// instruction.
3707///
3708/// \param __w
3709/// A 16-bit value used to initialize the elements of the destination integer
3710/// vector.
3711/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3712/// containing the value provided in the operand.
3713static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3714 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3715}
3716
3717/// Initializes all values in a 128-bit vector of [16 x i8] with the
3718/// specified 8-bit value.
3719///
3720/// \headerfile <x86intrin.h>
3721///
3722/// This intrinsic is a utility function and does not correspond to a specific
3723/// instruction.
3724///
3725/// \param __b
3726/// An 8-bit value used to initialize the elements of the destination integer
3727/// vector.
3728/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3729/// containing the value provided in the operand.
3730static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3731 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3732 __b, __b, __b, __b, __b);
3733}
3734
3735/// Constructs a 128-bit integer vector, initialized in reverse order
3736/// with the specified 64-bit integral values.
3737///
3738/// \headerfile <x86intrin.h>
3739///
3740/// This intrinsic does not correspond to a specific instruction.
3741///
3742/// \param __q0
3743/// A 64-bit integral value used to initialize the lower 64 bits of the
3744/// result.
3745/// \param __q1
3746/// A 64-bit integral value used to initialize the upper 64 bits of the
3747/// result.
3748/// \returns An initialized 128-bit integer vector.
3749static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3750 __m64 __q1) {
3751 return _mm_set_epi64(__q1, __q0);
3752}
3753
3754/// Constructs a 128-bit integer vector, initialized in reverse order
3755/// with the specified 32-bit integral values.
3756///
3757/// \headerfile <x86intrin.h>
3758///
3759/// This intrinsic is a utility function and does not correspond to a specific
3760/// instruction.
3761///
3762/// \param __i0
3763/// A 32-bit integral value used to initialize bits [31:0] of the result.
3764/// \param __i1
3765/// A 32-bit integral value used to initialize bits [63:32] of the result.
3766/// \param __i2
3767/// A 32-bit integral value used to initialize bits [95:64] of the result.
3768/// \param __i3
3769/// A 32-bit integral value used to initialize bits [127:96] of the result.
3770/// \returns An initialized 128-bit integer vector.
3771static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3772 int __i2,
3773 int __i3) {
3774 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3775}
3776
3777/// Constructs a 128-bit integer vector, initialized in reverse order
3778/// with the specified 16-bit integral values.
3779///
3780/// \headerfile <x86intrin.h>
3781///
3782/// This intrinsic is a utility function and does not correspond to a specific
3783/// instruction.
3784///
3785/// \param __w0
3786/// A 16-bit integral value used to initialize bits [15:0] of the result.
3787/// \param __w1
3788/// A 16-bit integral value used to initialize bits [31:16] of the result.
3789/// \param __w2
3790/// A 16-bit integral value used to initialize bits [47:32] of the result.
3791/// \param __w3
3792/// A 16-bit integral value used to initialize bits [63:48] of the result.
3793/// \param __w4
3794/// A 16-bit integral value used to initialize bits [79:64] of the result.
3795/// \param __w5
3796/// A 16-bit integral value used to initialize bits [95:80] of the result.
3797/// \param __w6
3798/// A 16-bit integral value used to initialize bits [111:96] of the result.
3799/// \param __w7
3800/// A 16-bit integral value used to initialize bits [127:112] of the result.
3801/// \returns An initialized 128-bit integer vector.
3802static __inline__ __m128i __DEFAULT_FN_ATTRS
3803_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3804 short __w5, short __w6, short __w7) {
3805 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3806}
3807
3808/// Constructs a 128-bit integer vector, initialized in reverse order
3809/// with the specified 8-bit integral values.
3810///
3811/// \headerfile <x86intrin.h>
3812///
3813/// This intrinsic is a utility function and does not correspond to a specific
3814/// instruction.
3815///
3816/// \param __b0
3817/// An 8-bit integral value used to initialize bits [7:0] of the result.
3818/// \param __b1
3819/// An 8-bit integral value used to initialize bits [15:8] of the result.
3820/// \param __b2
3821/// An 8-bit integral value used to initialize bits [23:16] of the result.
3822/// \param __b3
3823/// An 8-bit integral value used to initialize bits [31:24] of the result.
3824/// \param __b4
3825/// An 8-bit integral value used to initialize bits [39:32] of the result.
3826/// \param __b5
3827/// An 8-bit integral value used to initialize bits [47:40] of the result.
3828/// \param __b6
3829/// An 8-bit integral value used to initialize bits [55:48] of the result.
3830/// \param __b7
3831/// An 8-bit integral value used to initialize bits [63:56] of the result.
3832/// \param __b8
3833/// An 8-bit integral value used to initialize bits [71:64] of the result.
3834/// \param __b9
3835/// An 8-bit integral value used to initialize bits [79:72] of the result.
3836/// \param __b10
3837/// An 8-bit integral value used to initialize bits [87:80] of the result.
3838/// \param __b11
3839/// An 8-bit integral value used to initialize bits [95:88] of the result.
3840/// \param __b12
3841/// An 8-bit integral value used to initialize bits [103:96] of the result.
3842/// \param __b13
3843/// An 8-bit integral value used to initialize bits [111:104] of the result.
3844/// \param __b14
3845/// An 8-bit integral value used to initialize bits [119:112] of the result.
3846/// \param __b15
3847/// An 8-bit integral value used to initialize bits [127:120] of the result.
3848/// \returns An initialized 128-bit integer vector.
3849static __inline__ __m128i __DEFAULT_FN_ATTRS
3850_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3851 char __b6, char __b7, char __b8, char __b9, char __b10,
3852 char __b11, char __b12, char __b13, char __b14, char __b15) {
3853 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3854 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3855}
3856
3857/// Creates a 128-bit integer vector initialized to zero.
3858///
3859/// \headerfile <x86intrin.h>
3860///
3861/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3862///
3863/// \returns An initialized 128-bit integer vector with all elements set to
3864/// zero.
3865static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3866 return __extension__(__m128i)(__v2di){0LL, 0LL};
3867}
3868
3869/// Stores a 128-bit integer vector to a memory location aligned on a
3870/// 128-bit boundary.
3871///
3872/// \headerfile <x86intrin.h>
3873///
3874/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3875///
3876/// \param __p
3877/// A pointer to an aligned memory location that will receive the integer
3878/// values.
3879/// \param __b
3880/// A 128-bit integer vector containing the values to be moved.
3881static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3882 __m128i __b) {
3883 *__p = __b;
3884}
3885
3886/// Stores a 128-bit integer vector to an unaligned memory location.
3887///
3888/// \headerfile <x86intrin.h>
3889///
3890/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3891///
3892/// \param __p
3893/// A pointer to a memory location that will receive the integer values.
3894/// \param __b
3895/// A 128-bit integer vector containing the values to be moved.
3896static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3897 __m128i __b) {
3898 struct __storeu_si128 {
3899 __m128i_u __v;
3900 } __attribute__((__packed__, __may_alias__));
3901 ((struct __storeu_si128 *)__p)->__v = __b;
3902}
3903
3904/// Stores a 64-bit integer value from the low element of a 128-bit integer
3905/// vector.
3906///
3907/// \headerfile <x86intrin.h>
3908///
3909/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3910///
3911/// \param __p
3912/// A pointer to a 64-bit memory location. The address of the memory
3913/// location does not have to be aligned.
3914/// \param __b
3915/// A 128-bit integer vector containing the value to be stored.
3916static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3917 __m128i __b) {
3918 struct __storeu_si64 {
3919 long long __v;
3920 } __attribute__((__packed__, __may_alias__));
3921 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3922}
3923
3924/// Stores a 32-bit integer value from the low element of a 128-bit integer
3925/// vector.
3926///
3927/// \headerfile <x86intrin.h>
3928///
3929/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3930///
3931/// \param __p
3932/// A pointer to a 32-bit memory location. The address of the memory
3933/// location does not have to be aligned.
3934/// \param __b
3935/// A 128-bit integer vector containing the value to be stored.
3936static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3937 __m128i __b) {
3938 struct __storeu_si32 {
3939 int __v;
3940 } __attribute__((__packed__, __may_alias__));
3941 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3942}
3943
3944/// Stores a 16-bit integer value from the low element of a 128-bit integer
3945/// vector.
3946///
3947/// \headerfile <x86intrin.h>
3948///
3949/// This intrinsic does not correspond to a specific instruction.
3950///
3951/// \param __p
3952/// A pointer to a 16-bit memory location. The address of the memory
3953/// location does not have to be aligned.
3954/// \param __b
3955/// A 128-bit integer vector containing the value to be stored.
3956static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3957 __m128i __b) {
3958 struct __storeu_si16 {
3959 short __v;
3960 } __attribute__((__packed__, __may_alias__));
3961 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3962}
3963
3964/// Moves bytes selected by the mask from the first operand to the
3965/// specified unaligned memory location. When a mask bit is 1, the
3966/// corresponding byte is written, otherwise it is not written.
3967///
3968/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3969/// used again soon). Exception and trap behavior for elements not selected
3970/// for storage to memory are implementation dependent.
3971///
3972/// \headerfile <x86intrin.h>
3973///
3974/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3975/// instruction.
3976///
3977/// \param __d
3978/// A 128-bit integer vector containing the values to be moved.
3979/// \param __n
3980/// A 128-bit integer vector containing the mask. The most significant bit of
3981/// each byte represents the mask bits.
3982/// \param __p
3983/// A pointer to an unaligned 128-bit memory location where the specified
3984/// values are moved.
3985static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3986 __m128i __n,
3987 char *__p) {
3988 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3989}
3990
3991/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3992/// a memory location.
3993///
3994/// \headerfile <x86intrin.h>
3995///
3996/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3997///
3998/// \param __p
3999/// A pointer to a 64-bit memory location that will receive the lower 64 bits
4000/// of the integer vector parameter.
4001/// \param __a
4002/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4003/// value to be stored.
4004static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
4005 __m128i __a) {
4006 struct __mm_storel_epi64_struct {
4007 long long __u;
4008 } __attribute__((__packed__, __may_alias__));
4009 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4010}
4011
4012/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4013/// aligned memory location.
4014///
4015/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4016/// used again soon).
4017///
4018/// \headerfile <x86intrin.h>
4019///
4020/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4021///
4022/// \param __p
4023/// A pointer to the 128-bit aligned memory location used to store the value.
4024/// \param __a
4025/// A vector of [2 x double] containing the 64-bit values to be stored.
4026static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4027 __m128d __a) {
4028 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4029}
4030
4031/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4032///
4033/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4034/// used again soon).
4035///
4036/// \headerfile <x86intrin.h>
4037///
4038/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4039///
4040/// \param __p
4041/// A pointer to the 128-bit aligned memory location used to store the value.
4042/// \param __a
4043/// A 128-bit integer vector containing the values to be stored.
4044static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4045 __m128i __a) {
4046 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4047}
4048
4049/// Stores a 32-bit integer value in the specified memory location.
4050///
4051/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4052/// used again soon).
4053///
4054/// \headerfile <x86intrin.h>
4055///
4056/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4057///
4058/// \param __p
4059/// A pointer to the 32-bit memory location used to store the value.
4060/// \param __a
4061/// A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  /* The builtin takes an int pointer, so the untyped destination is
   * converted first. */
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
4067
4068#ifdef __x86_64__
4069/// Stores a 64-bit integer value in the specified memory location.
4070///
4071/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4072/// used again soon).
4073///
4074/// \headerfile <x86intrin.h>
4075///
4076/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4077///
4078/// \param __p
4079/// A pointer to the 64-bit memory location used to store the value.
4080/// \param __a
4081/// A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  /* The builtin takes a long long pointer, so the untyped destination is
   * converted first. */
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
4087#endif
4088
/* The three declarations below are prototypes only (each ends in ';' with no
 * inline body) and receive C linkage when this header is compiled as C++. */
4089#if defined(__cplusplus)
4090extern "C" {
4091#endif
4092
4093/// The cache line containing \a __p is flushed and invalidated from all
4094/// caches in the coherency domain.
4095///
4096/// \headerfile <x86intrin.h>
4097///
4098/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4099///
4100/// \param __p
4101/// A pointer to the memory location used to identify the cache line to be
4102/// flushed. The pointee is const-qualified in the prototype.
4103void _mm_clflush(void const *__p);
4104
4105/// Forces strong memory ordering (serialization) between load
4106/// instructions preceding this instruction and load instructions following
4107/// this instruction, ensuring the system completes all previous loads before
4108/// executing subsequent loads.
4109///
4110/// \headerfile <x86intrin.h>
4111///
4112/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4113///
4114void _mm_lfence(void);
4115
4116/// Forces strong memory ordering (serialization) between load and store
4117/// instructions preceding this instruction and load and store instructions
4118/// following this instruction, ensuring that the system completes all
4119/// previous memory accesses before executing subsequent memory accesses.
4120///
4121/// \headerfile <x86intrin.h>
4122///
4123/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4124///
4125void _mm_mfence(void);
4126
4127#if defined(__cplusplus)
4128} // extern "C"
4129#endif
4130
4131/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4132/// vector operands into 8-bit signed integers, and packs the results into
4133/// the destination.
4134///
4135/// Values greater than 0x7F (127) are saturated to 0x7F. Values less than
4136/// -128 are saturated to 0x80 (the 8-bit encoding of -128).
4137///
4138/// \headerfile <x86intrin.h>
4139///
4140/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4141///
4142/// \param __a
4143/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4144/// written to the lower 64 bits of the result.
4145/// \param __b
4146/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4147/// written to the higher 64 bits of the result.
4148/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4149static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4150 __m128i __b) {
4151 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4152}
4153
4154/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4155/// vector operands into 16-bit signed integers, and packs the results into
4156/// the destination.
4157///
4158/// Values greater than 0x7FFF (32767) are saturated to 0x7FFF. Values less
4159/// than -32768 are saturated to 0x8000 (the 16-bit encoding of -32768).
4160///
4161/// \headerfile <x86intrin.h>
4162///
4163/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4164///
4165/// \param __a
4166/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4167/// are written to the lower 64 bits of the result.
4168/// \param __b
4169/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4170/// are written to the higher 64 bits of the result.
4171/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4172static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4173 __m128i __b) {
4174 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4175}
4176
4177/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4178/// vector operands into 8-bit unsigned integers, and packs the results into
4179/// the destination.
4180///
4181/// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4182/// are saturated to 0x00.
4183///
4184/// \headerfile <x86intrin.h>
4185///
4186/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4187///
4188/// \param __a
4189/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4190/// written to the lower 64 bits of the result.
4191/// \param __b
4192/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4193/// written to the higher 64 bits of the result.
4194/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4195static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4196 __m128i __b) {
4197 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4198}
4199
4200/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4201/// the immediate-value parameter as a selector.
4202///
4203/// \headerfile <x86intrin.h>
4204///
4205/// \code
4206/// int _mm_extract_epi16(__m128i a, const int imm);
4207/// \endcode
4208///
4209/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4210///
4211/// \param a
4212/// A 128-bit integer vector.
4213/// \param imm
4214/// An immediate value. Bits [2:0] select the value from \a a to be assigned
4215/// to bits [15:0] of the result. \n
4216/// 000: assign values from bits [15:0] of \a a. \n
4217/// 001: assign values from bits [31:16] of \a a. \n
4218/// 010: assign values from bits [47:32] of \a a. \n
4219/// 011: assign values from bits [63:48] of \a a. \n
4220/// 100: assign values from bits [79:64] of \a a. \n
4221/// 101: assign values from bits [95:80] of \a a. \n
4222/// 110: assign values from bits [111:96] of \a a. \n
4223/// 111: assign values from bits [127:112] of \a a.
4224/// \returns An integer, whose lower 16 bits are selected from the 128-bit
4225/// integer vector parameter and the remaining bits are assigned zeros.
/// The expansion converts the extracted element to <c>unsigned short</c>
/// before widening it to <c>int</c>, which is what zero-extends the value.
4226#define _mm_extract_epi16(a, imm) \
4227 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4228 (int)(imm)))
4229
4230/// Constructs a 128-bit integer vector by first making a copy of the
4231/// 128-bit integer vector parameter, and then inserting the lower 16 bits
4232/// of an integer parameter into an offset specified by the immediate-value
4233/// parameter.
4234///
4235/// \headerfile <x86intrin.h>
4236///
4237/// \code
4238/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4239/// \endcode
4240///
4241/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4242///
4243/// \param a
4244/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4245/// result and then one of the eight elements in the result is replaced by
4246/// the lower 16 bits of \a b.
4247/// \param b
4248/// An integer. The lower 16 bits of this parameter are written to the
4249/// result beginning at an offset specified by \a imm.
4250/// \param imm
4251/// An immediate value selecting which of the eight 16-bit elements of the
4252/// result receives the lower 16 bits of \a b.
4253/// \returns A 128-bit integer vector containing the constructed values.
4254#define _mm_insert_epi16(a, b, imm) \
4255 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4256 (int)(imm)))
4257
4258/// Copies the values of the most significant bits from each 8-bit
4259/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4260/// value, zero-extends the value, and writes it to the destination.
4261///
4262/// \headerfile <x86intrin.h>
4263///
4264/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4265///
4266/// \param __a
4267/// A 128-bit integer vector containing the values with bits to be extracted.
4268/// \returns The most significant bits from each 8-bit element in \a __a,
4269/// written to bits [15:0]. The other bits are assigned zeros.
4270static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4271 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4272}
4273
4274/// Constructs a 128-bit integer vector by shuffling four 32-bit
4275/// elements of a 128-bit integer vector parameter, using the immediate-value
4276/// parameter as a specifier.
4277///
4278/// \headerfile <x86intrin.h>
4279///
4280/// \code
4281/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4282/// \endcode
4283///
4284/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4285///
4286/// \param a
4287/// A 128-bit integer vector containing the values to be copied.
4288/// \param imm
4289/// An immediate value containing an 8-bit value specifying which elements to
4290/// copy from \a a. The destinations within the 128-bit destination are assigned
4291/// values as follows: \n
4292/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4293/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4294/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4295/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4296/// Bit value assignments: \n
4297/// 00: assign values from bits [31:0] of \a a. \n
4298/// 01: assign values from bits [63:32] of \a a. \n
4299/// 10: assign values from bits [95:64] of \a a. \n
4300/// 11: assign values from bits [127:96] of \a a. \n
4301/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4302/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4303/// <c>[b6, b4, b2, b0]</c>.
4304/// \returns A 128-bit integer vector containing the shuffled values.
4305#define _mm_shuffle_epi32(a, imm) \
4306 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4307
4308/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4309/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4310/// value parameter as a specifier.
4311///
4312/// \headerfile <x86intrin.h>
4313///
4314/// \code
4315/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4316/// \endcode
4317///
4318/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4319///
4320/// \param a
4321/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4322/// [127:64] of the result.
4323/// \param imm
4324/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4325/// Bits [1:0] are used to assign values to bits [15:0] of the result. \n
4326/// Bits [3:2] are used to assign values to bits [31:16] of the result. \n
4327/// Bits [5:4] are used to assign values to bits [47:32] of the result. \n
4328/// Bits [7:6] are used to assign values to bits [63:48] of the result. \n
4329/// Bit value assignments: \n
4330/// 00: assign values from bits [15:0] of \a a. \n
4331/// 01: assign values from bits [31:16] of \a a. \n
4332/// 10: assign values from bits [47:32] of \a a. \n
4333/// 11: assign values from bits [63:48] of \a a. \n
4334/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4335/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4336/// <c>[b6, b4, b2, b0]</c>.
4337/// \returns A 128-bit integer vector containing the shuffled values.
4338#define _mm_shufflelo_epi16(a, imm) \
4339 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4340
4341/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4342/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4343/// value parameter as a specifier.
4344///
4345/// \headerfile <x86intrin.h>
4346///
4347/// \code
4348/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4349/// \endcode
4350///
4351/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4352///
4353/// \param a
4354/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4355/// [63:0] of the result.
4356/// \param imm
4357/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4358/// Bits [1:0] are used to assign values to bits [79:64] of the result. \n
4359/// Bits [3:2] are used to assign values to bits [95:80] of the result. \n
4360/// Bits [5:4] are used to assign values to bits [111:96] of the result. \n
4361/// Bits [7:6] are used to assign values to bits [127:112] of the result. \n
4362/// Bit value assignments: \n
4363/// 00: assign values from bits [79:64] of \a a. \n
4364/// 01: assign values from bits [95:80] of \a a. \n
4365/// 10: assign values from bits [111:96] of \a a. \n
4366/// 11: assign values from bits [127:112] of \a a. \n
4367/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4368/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4369/// <c>[b6, b4, b2, b0]</c>.
4370/// \returns A 128-bit integer vector containing the shuffled values.
4371#define _mm_shufflehi_epi16(a, imm) \
4372 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4373
4374/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4375/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4376///
4377/// \headerfile <x86intrin.h>
4378///
4379/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4380/// instruction.
4381///
4382/// \param __a
4383/// A 128-bit vector of [16 x i8]. \n
4384/// Bits [71:64] are written to bits [7:0] of the result. \n
4385/// Bits [79:72] are written to bits [23:16] of the result. \n
4386/// Bits [87:80] are written to bits [39:32] of the result. \n
4387/// Bits [95:88] are written to bits [55:48] of the result. \n
4388/// Bits [103:96] are written to bits [71:64] of the result. \n
4389/// Bits [111:104] are written to bits [87:80] of the result. \n
4390/// Bits [119:112] are written to bits [103:96] of the result. \n
4391/// Bits [127:120] are written to bits [119:112] of the result.
4392/// \param __b
4393/// A 128-bit vector of [16 x i8]. \n
4394/// Bits [71:64] are written to bits [15:8] of the result. \n
4395/// Bits [79:72] are written to bits [31:24] of the result. \n
4396/// Bits [87:80] are written to bits [47:40] of the result. \n
4397/// Bits [95:88] are written to bits [63:56] of the result. \n
4398/// Bits [103:96] are written to bits [79:72] of the result. \n
4399/// Bits [111:104] are written to bits [95:88] of the result. \n
4400/// Bits [119:112] are written to bits [111:104] of the result. \n
4401/// Bits [127:120] are written to bits [127:120] of the result.
4402/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4403static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4404 __m128i __b) {
4405 return (__m128i)__builtin_shufflevector(
4406 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4407 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4408}
4409
4410/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4411/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4412///
4413/// \headerfile <x86intrin.h>
4414///
4415/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4416/// instruction.
4417///
4418/// \param __a
4419/// A 128-bit vector of [8 x i16]. \n
4420/// Bits [79:64] are written to bits [15:0] of the result. \n
4421/// Bits [95:80] are written to bits [47:32] of the result. \n
4422/// Bits [111:96] are written to bits [79:64] of the result. \n
4423/// Bits [127:112] are written to bits [111:96] of the result.
4424/// \param __b
4425/// A 128-bit vector of [8 x i16]. \n
4426/// Bits [79:64] are written to bits [31:16] of the result. \n
4427/// Bits [95:80] are written to bits [63:48] of the result. \n
4428/// Bits [111:96] are written to bits [95:80] of the result. \n
4429/// Bits [127:112] are written to bits [127:112] of the result.
4430/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4431static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4432 __m128i __b) {
4433 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4434 8 + 5, 6, 8 + 6, 7, 8 + 7);
4435}
4436
4437/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4438/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4439///
4440/// \headerfile <x86intrin.h>
4441///
4442/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4443/// instruction.
4444///
4445/// \param __a
4446/// A 128-bit vector of [4 x i32]. \n
4447/// Bits [95:64] are written to bits [31:0] of the destination. \n
4448/// Bits [127:96] are written to bits [95:64] of the destination.
4449/// \param __b
4450/// A 128-bit vector of [4 x i32]. \n
4451/// Bits [95:64] are written to bits [63:32] of the destination. \n
4452/// Bits [127:96] are written to bits [127:96] of the destination.
4453/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4454static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4455 __m128i __b) {
4456 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4457 4 + 3);
4458}
4459
4460/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4461/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4462///
4463/// \headerfile <x86intrin.h>
4464///
4465/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4466/// instruction.
4467///
4468/// \param __a
4469/// A 128-bit vector of [2 x i64]. \n
4470/// Bits [127:64] are written to bits [63:0] of the destination.
4471/// \param __b
4472/// A 128-bit vector of [2 x i64]. \n
4473/// Bits [127:64] are written to bits [127:64] of the destination.
4474/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4475static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4476 __m128i __b) {
4477 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4478}
4479
4480/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4481/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4482///
4483/// \headerfile <x86intrin.h>
4484///
4485/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4486/// instruction.
4487///
4488/// \param __a
4489/// A 128-bit vector of [16 x i8]. \n
4490/// Bits [7:0] are written to bits [7:0] of the result. \n
4491/// Bits [15:8] are written to bits [23:16] of the result. \n
4492/// Bits [23:16] are written to bits [39:32] of the result. \n
4493/// Bits [31:24] are written to bits [55:48] of the result. \n
4494/// Bits [39:32] are written to bits [71:64] of the result. \n
4495/// Bits [47:40] are written to bits [87:80] of the result. \n
4496/// Bits [55:48] are written to bits [103:96] of the result. \n
4497/// Bits [63:56] are written to bits [119:112] of the result.
4498/// \param __b
4499/// A 128-bit vector of [16 x i8]. \n
4500/// Bits [7:0] are written to bits [15:8] of the result. \n
4501/// Bits [15:8] are written to bits [31:24] of the result. \n
4502/// Bits [23:16] are written to bits [47:40] of the result. \n
4503/// Bits [31:24] are written to bits [63:56] of the result. \n
4504/// Bits [39:32] are written to bits [79:72] of the result. \n
4505/// Bits [47:40] are written to bits [95:88] of the result. \n
4506/// Bits [55:48] are written to bits [111:104] of the result. \n
4507/// Bits [63:56] are written to bits [127:120] of the result.
4508/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4509static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4510 __m128i __b) {
4511 return (__m128i)__builtin_shufflevector(
4512 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4513 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4514}
4515
4516/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4517/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4518/// [8 x i16].
4519///
4520/// \headerfile <x86intrin.h>
4521///
4522/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4523/// instruction.
4524///
4525/// \param __a
4526/// A 128-bit vector of [8 x i16]. \n
4527/// Bits [15:0] are written to bits [15:0] of the result. \n
4528/// Bits [31:16] are written to bits [47:32] of the result. \n
4529/// Bits [47:32] are written to bits [79:64] of the result. \n
4530/// Bits [63:48] are written to bits [111:96] of the result.
4531/// \param __b
4532/// A 128-bit vector of [8 x i16]. \n
4533/// Bits [15:0] are written to bits [31:16] of the result. \n
4534/// Bits [31:16] are written to bits [63:48] of the result. \n
4535/// Bits [47:32] are written to bits [95:80] of the result. \n
4536/// Bits [63:48] are written to bits [127:112] of the result.
4537/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4538static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4539 __m128i __b) {
4540 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4541 8 + 1, 2, 8 + 2, 3, 8 + 3);
4542}
4543
4544/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4545/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4546///
4547/// \headerfile <x86intrin.h>
4548///
4549/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4550/// instruction.
4551///
4552/// \param __a
4553/// A 128-bit vector of [4 x i32]. \n
4554/// Bits [31:0] are written to bits [31:0] of the destination. \n
4555/// Bits [63:32] are written to bits [95:64] of the destination.
4556/// \param __b
4557/// A 128-bit vector of [4 x i32]. \n
4558/// Bits [31:0] are written to bits [63:32] of the destination. \n
4559/// Bits [63:32] are written to bits [127:96] of the destination.
4560/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4561static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4562 __m128i __b) {
4563 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4564 4 + 1);
4565}
4566
4567/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4568/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4569///
4570/// \headerfile <x86intrin.h>
4571///
4572/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4573/// instruction.
4574///
4575/// \param __a
4576/// A 128-bit vector of [2 x i64]. \n
4577/// Bits [63:0] are written to bits [63:0] of the destination. \n
4578/// \param __b
4579/// A 128-bit vector of [2 x i64]. \n
4580/// Bits [63:0] are written to bits [127:64] of the destination. \n
4581/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4582static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4583 __m128i __b) {
4584 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4585}
4586
4587/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4588/// integer.
4589///
4590/// \headerfile <x86intrin.h>
4591///
4592/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4593///
4594/// \param __a
4595/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4596/// destination.
4597/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4598static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4599 return (__m64)__a[0];
4600}
4601
4602/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4603/// upper bits.
4604///
4605/// \headerfile <x86intrin.h>
4606///
4607/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4608///
4609/// \param __a
4610/// A 64-bit value.
4611/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4612/// the operand. The upper 64 bits are assigned zeros.
4613static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4614 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4615}
4616
4617/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4618/// integer vector, zeroing the upper bits.
4619///
4620/// \headerfile <x86intrin.h>
4621///
4622/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4623///
4624/// \param __a
4625/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4626/// destination.
4627/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4628/// the operand. The upper 64 bits are assigned zeros.
4629static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4630 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4631}
4632
4633/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4634/// [2 x double] and interleaves them into a 128-bit vector of [2 x
4635/// double].
4636///
4637/// \headerfile <x86intrin.h>
4638///
4639/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4640///
4641/// \param __a
4642/// A 128-bit vector of [2 x double]. \n
4643/// Bits [127:64] are written to bits [63:0] of the destination.
4644/// \param __b
4645/// A 128-bit vector of [2 x double]. \n
4646/// Bits [127:64] are written to bits [127:64] of the destination.
4647/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4648static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4649 __m128d __b) {
4650 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4651}
4652
4653/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4654/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4655/// double].
4656///
4657/// \headerfile <x86intrin.h>
4658///
4659/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4660///
4661/// \param __a
4662/// A 128-bit vector of [2 x double]. \n
4663/// Bits [63:0] are written to bits [63:0] of the destination.
4664/// \param __b
4665/// A 128-bit vector of [2 x double]. \n
4666/// Bits [63:0] are written to bits [127:64] of the destination.
4667/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4668static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4669 __m128d __b) {
4670 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4671}
4672
4673/// Extracts the sign bits of the double-precision values in the 128-bit
4674/// vector of [2 x double], zero-extends the value, and writes it to the
4675/// low-order bits of the destination.
4676///
4677/// \headerfile <x86intrin.h>
4678///
4679/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4680///
4681/// \param __a
4682/// A 128-bit vector of [2 x double] containing the values with sign bits to
4683/// be extracted.
4684/// \returns The sign bits from each of the double-precision elements in \a __a,
4685/// written to bits [1:0]. The remaining bits are assigned values of zero.
4686static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4687 return __builtin_ia32_movmskpd((__v2df)__a);
4688}
4689
/// Builds a 128-bit floating-point vector of [2 x double] by selecting one
///    element from each of two 128-bit vectors of [2 x double], as directed
///    by an immediate-value selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd(                                             \
      (__v2df)(__m128d)(a),                                                    \
      (__v2df)(__m128d)(b),                                                    \
      (int)(i)))
4720
4721/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4722/// floating-point vector of [4 x float].
4723///
4724/// \headerfile <x86intrin.h>
4725///
4726/// This intrinsic has no corresponding instruction.
4727///
4728/// \param __a
4729/// A 128-bit floating-point vector of [2 x double].
4730/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4731/// bitwise pattern as the parameter.
4732static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4733 return (__m128)__a;
4734}
4735
4736/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4737/// integer vector.
4738///
4739/// \headerfile <x86intrin.h>
4740///
4741/// This intrinsic has no corresponding instruction.
4742///
4743/// \param __a
4744/// A 128-bit floating-point vector of [2 x double].
4745/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4746/// parameter.
4747static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4748 return (__m128i)__a;
4749}
4750
4751/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4752/// floating-point vector of [2 x double].
4753///
4754/// \headerfile <x86intrin.h>
4755///
4756/// This intrinsic has no corresponding instruction.
4757///
4758/// \param __a
4759/// A 128-bit floating-point vector of [4 x float].
4760/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4761/// bitwise pattern as the parameter.
4762static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4763 return (__m128d)__a;
4764}
4765
4766/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4767/// integer vector.
4768///
4769/// \headerfile <x86intrin.h>
4770///
4771/// This intrinsic has no corresponding instruction.
4772///
4773/// \param __a
4774/// A 128-bit floating-point vector of [4 x float].
4775/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4776/// parameter.
4777static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4778 return (__m128i)__a;
4779}
4780
4781/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4782/// of [4 x float].
4783///
4784/// \headerfile <x86intrin.h>
4785///
4786/// This intrinsic has no corresponding instruction.
4787///
4788/// \param __a
4789/// A 128-bit integer vector.
4790/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4791/// bitwise pattern as the parameter.
4792static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4793 return (__m128)__a;
4794}
4795
4796/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4797/// of [2 x double].
4798///
4799/// \headerfile <x86intrin.h>
4800///
4801/// This intrinsic has no corresponding instruction.
4802///
4803/// \param __a
4804/// A 128-bit integer vector.
4805/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4806/// bitwise pattern as the parameter.
4807static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4808 return (__m128d)__a;
4809}
4810
/// Performs an element-wise comparison of the double-precision values in two
///    128-bit vectors of [2 x double]; the comparison to apply is chosen by
///    an immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmppd(                                              \
      (__v2df)(__m128d)(a),                                                    \
      (__v2df)(__m128d)(b),                                                    \
      (c)))
4846
/// Performs a comparison of the corresponding scalar double-precision values
///    of two 128-bit vectors of [2 x double]; the comparison to apply is
///    chosen by an immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmpsd(                                              \
      (__v2df)(__m128d)(a),                                                    \
      (__v2df)(__m128d)(b),                                                    \
      (c)))
4882
/* _mm_pause is declared with C linkage so that C and C++ translation units
   refer to the same symbol. */
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is executing a spin loop, for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
4899
/* Drop the helper macros used by the definitions above so that they are not
   visible after this header has been included. */
#undef __DEFAULT_FN_ATTRS
#undef __anyext128
#undef __trunc64
4903
/* Builds a 2-bit selector of the form [x, y]: x becomes bit 1 and y is OR-ed
   into bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
4905
/* Bit of the value read by _mm_getcsr() / written by _mm_setcsr() that holds
   the denormals-zero mode. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* The two values the masked bit can take. */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Reads the current mode: the control/status value masked down to the
   denormals-zero bit. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

/* Writes a new mode: clears the denormals-zero bit in the current
   control/status value, ORs in (x), and stores the result. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4914
4915#endif /* __EMMINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3749
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1053
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4538
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4613
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1962
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3592
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1029
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1814
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1532
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4195
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2367
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:594
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:83
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:221
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:141
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4747
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:407
#define __anyext128(x)
Definition: emmintrin.h:64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1655
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4044
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4270
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2818
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2669
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:829
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1195
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1618
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2566
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3417
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3985
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1171
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3552
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1219
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2162
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1562
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1324
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3082
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3009
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1798
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3222
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1832
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2515
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:751
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:202
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:4004
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3242
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2704
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:528
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3956
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:307
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit vector of [2 x double] and clears the u...
Definition: emmintrin.h:1693
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1674
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4648
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2442
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:776
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2688
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3140
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3027
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3101
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2424
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1147
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2873
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:424
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2329
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2266
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1942
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4807
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2991
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4509
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:802
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3162
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:982
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:57
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4668
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2588
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:726
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:678
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4582
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2911
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4686
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3120
static __inline__ void int __a
Definition: emmintrin.h:4064
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:162
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2652
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3936
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1498
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4629
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:486
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4475
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition: emmintrin.h:1479
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3320
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3386
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4561
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3182
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1435
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of each of the two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:262
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1885
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4431
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1346
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3484
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4777
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1243
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2291
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2228
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3371
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2550
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1636
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3045
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2764
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:615
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1388
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1365
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:123
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1744
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2206
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2184
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2610
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2498
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1764
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4454
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1285
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3850
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2386
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1101
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:573
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2140
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2405
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:390
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1005
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2310
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4732
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2854
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4149
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3696
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2892
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1602
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1077
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:657
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4762
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2018
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2800
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2481
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:956
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:856
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2037
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1923
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2103
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3447
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3730
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: emmintrin.h:4172
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2782
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2061
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:931
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:507
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3432
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:701
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4026
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3202
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4792
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1866
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:370
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3504
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2120
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1410
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2460
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:906
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3063
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:881
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers,...
Definition: emmintrin.h:3357
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3803
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1718
#define __trunc64(x)
Definition: emmintrin.h:62
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3641
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3679
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:245
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1852
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3525
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:286
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2836
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:101
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3916
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1902
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1578
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:353
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2348
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4598
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3881
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:180
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:444
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1782
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:332
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1547
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3467
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3713
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3662
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3401
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2532
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2247
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:2001
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4403
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1517
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2973
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1979
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1267
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2082
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the ...
Definition: emmintrin.h:2631
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1303
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3865
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1125
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:550
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:465
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3896
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2930
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3771
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1459
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3338
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2721
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:636
struct __storeu_i16 *__P __v
Definition: immintrin.h:472