clang 22.0.0git
emmintrin.h
Go to the documentation of this file.
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __EMMINTRIN_H
11#define __EMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <xmmintrin.h>
18
/* 128-bit vector of [2 x double] with 16-byte alignment. */
typedef double __m128d
    __attribute__((__vector_size__(16), __aligned__(16)));

/* 1-byte-aligned 128-bit vector types: [2 x double] and [2 x long long]. */
typedef double __m128d_u
    __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));
24
/* Internal element-typed vector aliases used by the implementations below. */

/* 128-bit vector of [2 x double]. */
typedef double __v2df
    __attribute__((__vector_size__(16)));

/* 128-bit vector of [2 x unsigned long long]. */
typedef unsigned long long __v2du
    __attribute__((__vector_size__(16)));

/* 128-bit vector of [16 x signed char]. Plain char has implementation-defined
 * signedness, so an explicitly signed variant is needed; it should not appear
 * in the public interface. */
typedef signed char __v16qs
    __attribute__((__vector_size__(16)));
34
35#ifdef __SSE2__
36/* Both _Float16 and __bf16 require SSE2 being enabled. */
37typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
38typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
39typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
40
41typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
42typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
43#endif
44
/* Attribute set applied to the functions defined in this file: always-inline,
 * nodebug, the "sse2" target feature, and a minimum vector width of 128. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))

/* Same attribute set; additionally expands to constexpr when compiled as
 * C++11 or later. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#endif
55
/* Yields, as an __m64, the single element 0 of (x) viewed as a __v2di, i.e.
 * the low 64 bits of the 128-bit operand. The whole expansion is wrapped in
 * parentheses so that an operator written next to the macro at a use site
 * cannot bind to the builtin call ahead of the cast. */
#define __trunc64(x)                                                           \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
/* Widens (x), viewed as a __v2si, to an __m128i: result elements 0 and 1 come
 * from (x) and elements 2 and 3 come from an empty-initialized __v2si. The
 * whole expansion is wrapped in parentheses so that an operator written next
 * to the macro at a use site cannot bind to the builtin call ahead of the
 * cast. */
#define __zext128(x)                                                           \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, 2, 3))
/* Widens (x), viewed as a __v2si, to an __m128i: result elements 0 and 1 come
 * from (x); elements 2 and 3 use shuffle index -1, which leaves them
 * unspecified. The whole expansion is wrapped in parentheses so that an
 * operator written next to the macro at a use site cannot bind to the builtin
 * call ahead of the cast. */
#define __anyext128(x)                                                         \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1))
64
65/// Adds lower double-precision values in both operands and returns the
66/// sum in the lower 64 bits of the result. The upper 64 bits of the result
67/// are copied from the upper double-precision value of the first operand.
68///
69/// \headerfile <x86intrin.h>
70///
71/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
72///
73/// \param __a
74/// A 128-bit vector of [2 x double] containing one of the source operands.
75/// \param __b
76/// A 128-bit vector of [2 x double] containing one of the source operands.
77/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
78/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
79/// from the upper 64 bits of the first source operand.
80static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a,
81 __m128d __b) {
82 __a[0] += __b[0];
83 return __a;
84}
85
86/// Adds two 128-bit vectors of [2 x double].
87///
88/// \headerfile <x86intrin.h>
89///
90/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
91///
92/// \param __a
93/// A 128-bit vector of [2 x double] containing one of the source operands.
94/// \param __b
95/// A 128-bit vector of [2 x double] containing one of the source operands.
96/// \returns A 128-bit vector of [2 x double] containing the sums of both
97/// operands.
98static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a,
99 __m128d __b) {
100 return (__m128d)((__v2df)__a + (__v2df)__b);
101}
102
103/// Subtracts the lower double-precision value of the second operand
104/// from the lower double-precision value of the first operand and returns
105/// the difference in the lower 64 bits of the result. The upper 64 bits of
106/// the result are copied from the upper double-precision value of the first
107/// operand.
108///
109/// \headerfile <x86intrin.h>
110///
111/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
112///
113/// \param __a
114/// A 128-bit vector of [2 x double] containing the minuend.
115/// \param __b
116/// A 128-bit vector of [2 x double] containing the subtrahend.
117/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
118/// difference of the lower 64 bits of both operands. The upper 64 bits are
119/// copied from the upper 64 bits of the first source operand.
120static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a,
121 __m128d __b) {
122 __a[0] -= __b[0];
123 return __a;
124}
125
126/// Subtracts two 128-bit vectors of [2 x double].
127///
128/// \headerfile <x86intrin.h>
129///
130/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
131///
132/// \param __a
133/// A 128-bit vector of [2 x double] containing the minuend.
134/// \param __b
135/// A 128-bit vector of [2 x double] containing the subtrahend.
136/// \returns A 128-bit vector of [2 x double] containing the differences between
137/// both operands.
138static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a,
139 __m128d __b) {
140 return (__m128d)((__v2df)__a - (__v2df)__b);
141}
142
143/// Multiplies lower double-precision values in both operands and returns
144/// the product in the lower 64 bits of the result. The upper 64 bits of the
145/// result are copied from the upper double-precision value of the first
146/// operand.
147///
148/// \headerfile <x86intrin.h>
149///
150/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
151///
152/// \param __a
153/// A 128-bit vector of [2 x double] containing one of the source operands.
154/// \param __b
155/// A 128-bit vector of [2 x double] containing one of the source operands.
156/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
157/// product of the lower 64 bits of both operands. The upper 64 bits are
158/// copied from the upper 64 bits of the first source operand.
159static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a,
160 __m128d __b) {
161 __a[0] *= __b[0];
162 return __a;
163}
164
165/// Multiplies two 128-bit vectors of [2 x double].
166///
167/// \headerfile <x86intrin.h>
168///
169/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
170///
171/// \param __a
172/// A 128-bit vector of [2 x double] containing one of the operands.
173/// \param __b
174/// A 128-bit vector of [2 x double] containing one of the operands.
175/// \returns A 128-bit vector of [2 x double] containing the products of both
176/// operands.
177static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a,
178 __m128d __b) {
179 return (__m128d)((__v2df)__a * (__v2df)__b);
180}
181
182/// Divides the lower double-precision value of the first operand by the
183/// lower double-precision value of the second operand and returns the
184/// quotient in the lower 64 bits of the result. The upper 64 bits of the
185/// result are copied from the upper double-precision value of the first
186/// operand.
187///
188/// \headerfile <x86intrin.h>
189///
190/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
191///
192/// \param __a
193/// A 128-bit vector of [2 x double] containing the dividend.
194/// \param __b
195/// A 128-bit vector of [2 x double] containing divisor.
196/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
197/// quotient of the lower 64 bits of both operands. The upper 64 bits are
198/// copied from the upper 64 bits of the first source operand.
199static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a,
200 __m128d __b) {
201 __a[0] /= __b[0];
202 return __a;
203}
204
205/// Performs an element-by-element division of two 128-bit vectors of
206/// [2 x double].
207///
208/// \headerfile <x86intrin.h>
209///
210/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
211///
212/// \param __a
213/// A 128-bit vector of [2 x double] containing the dividend.
214/// \param __b
215/// A 128-bit vector of [2 x double] containing the divisor.
216/// \returns A 128-bit vector of [2 x double] containing the quotients of both
217/// operands.
218static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
219 __m128d __b) {
220 return (__m128d)((__v2df)__a / (__v2df)__b);
221}
222
223/// Calculates the square root of the lower double-precision value of
224/// the second operand and returns it in the lower 64 bits of the result.
225/// The upper 64 bits of the result are copied from the upper
226/// double-precision value of the first operand.
227///
228/// \headerfile <x86intrin.h>
229///
230/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
231///
232/// \param __a
233/// A 128-bit vector of [2 x double] containing one of the operands. The
234/// upper 64 bits of this operand are copied to the upper 64 bits of the
235/// result.
236/// \param __b
237/// A 128-bit vector of [2 x double] containing one of the operands. The
238/// square root is calculated using the lower 64 bits of this operand.
239/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
240/// square root of the lower 64 bits of operand \a __b, and whose upper 64
241/// bits are copied from the upper 64 bits of operand \a __a.
242static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
243 __m128d __b) {
244 return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
245}
246
247/// Calculates the square root of the each of two values stored in a
248/// 128-bit vector of [2 x double].
249///
250/// \headerfile <x86intrin.h>
251///
252/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
253///
254/// \param __a
255/// A 128-bit vector of [2 x double].
256/// \returns A 128-bit vector of [2 x double] containing the square roots of the
257/// values in the operand.
258static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
259 return __builtin_elementwise_sqrt(__a);
260}
261
262/// Compares lower 64-bit double-precision values of both operands, and
263/// returns the lesser of the pair of values in the lower 64-bits of the
264/// result. The upper 64 bits of the result are copied from the upper
265/// double-precision value of the first operand.
266///
267/// If either value in a comparison is NaN, returns the value from \a __b.
268///
269/// \headerfile <x86intrin.h>
270///
271/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
272///
273/// \param __a
274/// A 128-bit vector of [2 x double] containing one of the operands. The
275/// lower 64 bits of this operand are used in the comparison.
276/// \param __b
277/// A 128-bit vector of [2 x double] containing one of the operands. The
278/// lower 64 bits of this operand are used in the comparison.
279/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
280/// minimum value between both operands. The upper 64 bits are copied from
281/// the upper 64 bits of the first source operand.
282static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
283 __m128d __b) {
284 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
285}
286
287/// Performs element-by-element comparison of the two 128-bit vectors of
288/// [2 x double] and returns a vector containing the lesser of each pair of
289/// values.
290///
291/// If either value in a comparison is NaN, returns the value from \a __b.
292///
293/// \headerfile <x86intrin.h>
294///
295/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
296///
297/// \param __a
298/// A 128-bit vector of [2 x double] containing one of the operands.
299/// \param __b
300/// A 128-bit vector of [2 x double] containing one of the operands.
301/// \returns A 128-bit vector of [2 x double] containing the minimum values
302/// between both operands.
303static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
304 __m128d __b) {
305 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
306}
307
308/// Compares lower 64-bit double-precision values of both operands, and
309/// returns the greater of the pair of values in the lower 64-bits of the
310/// result. The upper 64 bits of the result are copied from the upper
311/// double-precision value of the first operand.
312///
313/// If either value in a comparison is NaN, returns the value from \a __b.
314///
315/// \headerfile <x86intrin.h>
316///
317/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
318///
319/// \param __a
320/// A 128-bit vector of [2 x double] containing one of the operands. The
321/// lower 64 bits of this operand are used in the comparison.
322/// \param __b
323/// A 128-bit vector of [2 x double] containing one of the operands. The
324/// lower 64 bits of this operand are used in the comparison.
325/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
326/// maximum value between both operands. The upper 64 bits are copied from
327/// the upper 64 bits of the first source operand.
328static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
329 __m128d __b) {
330 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
331}
332
333/// Performs element-by-element comparison of the two 128-bit vectors of
334/// [2 x double] and returns a vector containing the greater of each pair
335/// of values.
336///
337/// If either value in a comparison is NaN, returns the value from \a __b.
338///
339/// \headerfile <x86intrin.h>
340///
341/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
342///
343/// \param __a
344/// A 128-bit vector of [2 x double] containing one of the operands.
345/// \param __b
346/// A 128-bit vector of [2 x double] containing one of the operands.
347/// \returns A 128-bit vector of [2 x double] containing the maximum values
348/// between both operands.
349static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
350 __m128d __b) {
351 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
352}
353
354/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
355///
356/// \headerfile <x86intrin.h>
357///
358/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
359///
360/// \param __a
361/// A 128-bit vector of [2 x double] containing one of the source operands.
362/// \param __b
363/// A 128-bit vector of [2 x double] containing one of the source operands.
364/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
365/// values between both operands.
366static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a,
367 __m128d __b) {
368 return (__m128d)((__v2du)__a & (__v2du)__b);
369}
370
371/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
372/// the one's complement of the values contained in the first source operand.
373///
374/// \headerfile <x86intrin.h>
375///
376/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
377///
378/// \param __a
379/// A 128-bit vector of [2 x double] containing the left source operand. The
380/// one's complement of this value is used in the bitwise AND.
381/// \param __b
382/// A 128-bit vector of [2 x double] containing the right source operand.
383/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
384/// values in the second operand and the one's complement of the first
385/// operand.
386static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
387_mm_andnot_pd(__m128d __a, __m128d __b) {
388 return (__m128d)(~(__v2du)__a & (__v2du)__b);
389}
390
391/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
392///
393/// \headerfile <x86intrin.h>
394///
395/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
396///
397/// \param __a
398/// A 128-bit vector of [2 x double] containing one of the source operands.
399/// \param __b
400/// A 128-bit vector of [2 x double] containing one of the source operands.
401/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
402/// values between both operands.
403static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a,
404 __m128d __b) {
405 return (__m128d)((__v2du)__a | (__v2du)__b);
406}
407
408/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
409///
410/// \headerfile <x86intrin.h>
411///
412/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
413///
414/// \param __a
415/// A 128-bit vector of [2 x double] containing one of the source operands.
416/// \param __b
417/// A 128-bit vector of [2 x double] containing one of the source operands.
418/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
419/// values between both operands.
420static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a,
421 __m128d __b) {
422 return (__m128d)((__v2du)__a ^ (__v2du)__b);
423}
424
425/// Compares each of the corresponding double-precision values of the
426/// 128-bit vectors of [2 x double] for equality.
427///
428/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
429/// If either value in a comparison is NaN, returns false.
430///
431/// \headerfile <x86intrin.h>
432///
433/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
434///
435/// \param __a
436/// A 128-bit vector of [2 x double].
437/// \param __b
438/// A 128-bit vector of [2 x double].
439/// \returns A 128-bit vector containing the comparison results.
440static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
441 __m128d __b) {
442 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
443}
444
445/// Compares each of the corresponding double-precision values of the
446/// 128-bit vectors of [2 x double] to determine if the values in the first
447/// operand are less than those in the second operand.
448///
449/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
450/// If either value in a comparison is NaN, returns false.
451///
452/// \headerfile <x86intrin.h>
453///
454/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
455///
456/// \param __a
457/// A 128-bit vector of [2 x double].
458/// \param __b
459/// A 128-bit vector of [2 x double].
460/// \returns A 128-bit vector containing the comparison results.
461static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
462 __m128d __b) {
463 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
464}
465
466/// Compares each of the corresponding double-precision values of the
467/// 128-bit vectors of [2 x double] to determine if the values in the first
468/// operand are less than or equal to those in the second operand.
469///
470/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
471/// If either value in a comparison is NaN, returns false.
472///
473/// \headerfile <x86intrin.h>
474///
475/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
476///
477/// \param __a
478/// A 128-bit vector of [2 x double].
479/// \param __b
480/// A 128-bit vector of [2 x double].
481/// \returns A 128-bit vector containing the comparison results.
482static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
483 __m128d __b) {
484 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
485}
486
487/// Compares each of the corresponding double-precision values of the
488/// 128-bit vectors of [2 x double] to determine if the values in the first
489/// operand are greater than those in the second operand.
490///
491/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
492/// If either value in a comparison is NaN, returns false.
493///
494/// \headerfile <x86intrin.h>
495///
496/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
497///
498/// \param __a
499/// A 128-bit vector of [2 x double].
500/// \param __b
501/// A 128-bit vector of [2 x double].
502/// \returns A 128-bit vector containing the comparison results.
503static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
504 __m128d __b) {
505 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
506}
507
508/// Compares each of the corresponding double-precision values of the
509/// 128-bit vectors of [2 x double] to determine if the values in the first
510/// operand are greater than or equal to those in the second operand.
511///
512/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
513/// If either value in a comparison is NaN, returns false.
514///
515/// \headerfile <x86intrin.h>
516///
517/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
518///
519/// \param __a
520/// A 128-bit vector of [2 x double].
521/// \param __b
522/// A 128-bit vector of [2 x double].
523/// \returns A 128-bit vector containing the comparison results.
524static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
525 __m128d __b) {
526 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
527}
528
529/// Compares each of the corresponding double-precision values of the
530/// 128-bit vectors of [2 x double] to determine if the values in the first
531/// operand are ordered with respect to those in the second operand.
532///
533/// A pair of double-precision values are ordered with respect to each
534/// other if neither value is a NaN. Each comparison returns 0x0 for false,
535/// 0xFFFFFFFFFFFFFFFF for true.
536///
537/// \headerfile <x86intrin.h>
538///
539/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
540///
541/// \param __a
542/// A 128-bit vector of [2 x double].
543/// \param __b
544/// A 128-bit vector of [2 x double].
545/// \returns A 128-bit vector containing the comparison results.
546static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
547 __m128d __b) {
548 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
549}
550
551/// Compares each of the corresponding double-precision values of the
552/// 128-bit vectors of [2 x double] to determine if the values in the first
553/// operand are unordered with respect to those in the second operand.
554///
555/// A pair of double-precision values are unordered with respect to each
556/// other if one or both values are NaN. Each comparison returns 0x0 for
557/// false, 0xFFFFFFFFFFFFFFFF for true.
558///
559/// \headerfile <x86intrin.h>
560///
561/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
562/// instruction.
563///
564/// \param __a
565/// A 128-bit vector of [2 x double].
566/// \param __b
567/// A 128-bit vector of [2 x double].
568/// \returns A 128-bit vector containing the comparison results.
569static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
570 __m128d __b) {
571 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
572}
573
574/// Compares each of the corresponding double-precision values of the
575/// 128-bit vectors of [2 x double] to determine if the values in the first
576/// operand are unequal to those in the second operand.
577///
578/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
579/// If either value in a comparison is NaN, returns true.
580///
581/// \headerfile <x86intrin.h>
582///
583/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
584///
585/// \param __a
586/// A 128-bit vector of [2 x double].
587/// \param __b
588/// A 128-bit vector of [2 x double].
589/// \returns A 128-bit vector containing the comparison results.
590static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
591 __m128d __b) {
592 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
593}
594
595/// Compares each of the corresponding double-precision values of the
596/// 128-bit vectors of [2 x double] to determine if the values in the first
597/// operand are not less than those in the second operand.
598///
599/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
600/// If either value in a comparison is NaN, returns true.
601///
602/// \headerfile <x86intrin.h>
603///
604/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
605///
606/// \param __a
607/// A 128-bit vector of [2 x double].
608/// \param __b
609/// A 128-bit vector of [2 x double].
610/// \returns A 128-bit vector containing the comparison results.
611static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
612 __m128d __b) {
613 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
614}
615
616/// Compares each of the corresponding double-precision values of the
617/// 128-bit vectors of [2 x double] to determine if the values in the first
618/// operand are not less than or equal to those in the second operand.
619///
620/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
621/// If either value in a comparison is NaN, returns true.
622///
623/// \headerfile <x86intrin.h>
624///
625/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
626///
627/// \param __a
628/// A 128-bit vector of [2 x double].
629/// \param __b
630/// A 128-bit vector of [2 x double].
631/// \returns A 128-bit vector containing the comparison results.
632static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
633 __m128d __b) {
634 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
635}
636
637/// Compares each of the corresponding double-precision values of the
638/// 128-bit vectors of [2 x double] to determine if the values in the first
639/// operand are not greater than those in the second operand.
640///
641/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
642/// If either value in a comparison is NaN, returns true.
643///
644/// \headerfile <x86intrin.h>
645///
646/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
647///
648/// \param __a
649/// A 128-bit vector of [2 x double].
650/// \param __b
651/// A 128-bit vector of [2 x double].
652/// \returns A 128-bit vector containing the comparison results.
653static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
654 __m128d __b) {
655 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
656}
657
658/// Compares each of the corresponding double-precision values of the
659/// 128-bit vectors of [2 x double] to determine if the values in the first
660/// operand are not greater than or equal to those in the second operand.
661///
662/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
663/// If either value in a comparison is NaN, returns true.
664///
665/// \headerfile <x86intrin.h>
666///
667/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
668///
669/// \param __a
670/// A 128-bit vector of [2 x double].
671/// \param __b
672/// A 128-bit vector of [2 x double].
673/// \returns A 128-bit vector containing the comparison results.
674static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
675 __m128d __b) {
676 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
677}
678
679/// Compares the lower double-precision floating-point values in each of
680/// the two 128-bit floating-point vectors of [2 x double] for equality.
681///
682/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
683/// If either value in a comparison is NaN, returns false.
684///
685/// \headerfile <x86intrin.h>
686///
687/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
688///
689/// \param __a
690/// A 128-bit vector of [2 x double]. The lower double-precision value is
691/// compared to the lower double-precision value of \a __b.
692/// \param __b
693/// A 128-bit vector of [2 x double]. The lower double-precision value is
694/// compared to the lower double-precision value of \a __a.
695/// \returns A 128-bit vector. The lower 64 bits contains the comparison
696/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
697static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
698 __m128d __b) {
699 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
700}
701
702/// Compares the lower double-precision floating-point values in each of
703/// the two 128-bit floating-point vectors of [2 x double] to determine if
704/// the value in the first parameter is less than the corresponding value in
705/// the second parameter.
706///
707/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
708/// If either value in a comparison is NaN, returns false.
709///
710/// \headerfile <x86intrin.h>
711///
712/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
713///
714/// \param __a
715/// A 128-bit vector of [2 x double]. The lower double-precision value is
716/// compared to the lower double-precision value of \a __b.
717/// \param __b
718/// A 128-bit vector of [2 x double]. The lower double-precision value is
719/// compared to the lower double-precision value of \a __a.
720/// \returns A 128-bit vector. The lower 64 bits contains the comparison
721/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
722static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
723 __m128d __b) {
724 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
725}
726
727/// Compares the lower double-precision floating-point values in each of
728/// the two 128-bit floating-point vectors of [2 x double] to determine if
729/// the value in the first parameter is less than or equal to the
730/// corresponding value in the second parameter.
731///
732/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
733/// If either value in a comparison is NaN, returns false.
734///
735/// \headerfile <x86intrin.h>
736///
737/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
738///
739/// \param __a
740/// A 128-bit vector of [2 x double]. The lower double-precision value is
741/// compared to the lower double-precision value of \a __b.
742/// \param __b
743/// A 128-bit vector of [2 x double]. The lower double-precision value is
744/// compared to the lower double-precision value of \a __a.
745/// \returns A 128-bit vector. The lower 64 bits contains the comparison
746/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
747static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
748 __m128d __b) {
749 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
750}
751
752/// Compares the lower double-precision floating-point values in each of
753/// the two 128-bit floating-point vectors of [2 x double] to determine if
754/// the value in the first parameter is greater than the corresponding value
755/// in the second parameter.
756///
757/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
758/// If either value in a comparison is NaN, returns false.
759///
760/// \headerfile <x86intrin.h>
761///
762/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
763///
764/// \param __a
765/// A 128-bit vector of [2 x double]. The lower double-precision value is
766/// compared to the lower double-precision value of \a __b.
767/// \param __b
768/// A 128-bit vector of [2 x double]. The lower double-precision value is
769/// compared to the lower double-precision value of \a __a.
770/// \returns A 128-bit vector. The lower 64 bits contains the comparison
771/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
772static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
773 __m128d __b) {
774 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
775 return __extension__(__m128d){__c[0], __a[1]};
776}
777
778/// Compares the lower double-precision floating-point values in each of
779/// the two 128-bit floating-point vectors of [2 x double] to determine if
780/// the value in the first parameter is greater than or equal to the
781/// corresponding value in the second parameter.
782///
783/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
784/// If either value in a comparison is NaN, returns false.
785///
786/// \headerfile <x86intrin.h>
787///
788/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
789///
790/// \param __a
791/// A 128-bit vector of [2 x double]. The lower double-precision value is
792/// compared to the lower double-precision value of \a __b.
793/// \param __b
794/// A 128-bit vector of [2 x double]. The lower double-precision value is
795/// compared to the lower double-precision value of \a __a.
796/// \returns A 128-bit vector. The lower 64 bits contains the comparison
797/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
798static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
799 __m128d __b) {
800 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
801 return __extension__(__m128d){__c[0], __a[1]};
802}
803
804/// Compares the lower double-precision floating-point values in each of
805/// the two 128-bit floating-point vectors of [2 x double] to determine if
806/// the value in the first parameter is ordered with respect to the
807/// corresponding value in the second parameter.
808///
809/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
810/// of double-precision values are ordered with respect to each other if
811/// neither value is a NaN.
812///
813/// \headerfile <x86intrin.h>
814///
815/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
816///
817/// \param __a
818/// A 128-bit vector of [2 x double]. The lower double-precision value is
819/// compared to the lower double-precision value of \a __b.
820/// \param __b
821/// A 128-bit vector of [2 x double]. The lower double-precision value is
822/// compared to the lower double-precision value of \a __a.
823/// \returns A 128-bit vector. The lower 64 bits contains the comparison
824/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
825static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
826 __m128d __b) {
827 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
828}
829
830/// Compares the lower double-precision floating-point values in each of
831/// the two 128-bit floating-point vectors of [2 x double] to determine if
832/// the value in the first parameter is unordered with respect to the
833/// corresponding value in the second parameter.
834///
835/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
836/// of double-precision values are unordered with respect to each other if
837/// one or both values are NaN.
838///
839/// \headerfile <x86intrin.h>
840///
841/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
842/// instruction.
843///
844/// \param __a
845/// A 128-bit vector of [2 x double]. The lower double-precision value is
846/// compared to the lower double-precision value of \a __b.
847/// \param __b
848/// A 128-bit vector of [2 x double]. The lower double-precision value is
849/// compared to the lower double-precision value of \a __a.
850/// \returns A 128-bit vector. The lower 64 bits contains the comparison
851/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
852static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
853 __m128d __b) {
854 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
855}
856
857/// Compares the lower double-precision floating-point values in each of
858/// the two 128-bit floating-point vectors of [2 x double] to determine if
859/// the value in the first parameter is unequal to the corresponding value in
860/// the second parameter.
861///
862/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
863/// If either value in a comparison is NaN, returns true.
864///
865/// \headerfile <x86intrin.h>
866///
867/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
868///
869/// \param __a
870/// A 128-bit vector of [2 x double]. The lower double-precision value is
871/// compared to the lower double-precision value of \a __b.
872/// \param __b
873/// A 128-bit vector of [2 x double]. The lower double-precision value is
874/// compared to the lower double-precision value of \a __a.
875/// \returns A 128-bit vector. The lower 64 bits contains the comparison
876/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
877static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
878 __m128d __b) {
879 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
880}
881
882/// Compares the lower double-precision floating-point values in each of
883/// the two 128-bit floating-point vectors of [2 x double] to determine if
884/// the value in the first parameter is not less than the corresponding
885/// value in the second parameter.
886///
887/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
888/// If either value in a comparison is NaN, returns true.
889///
890/// \headerfile <x86intrin.h>
891///
892/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
893///
894/// \param __a
895/// A 128-bit vector of [2 x double]. The lower double-precision value is
896/// compared to the lower double-precision value of \a __b.
897/// \param __b
898/// A 128-bit vector of [2 x double]. The lower double-precision value is
899/// compared to the lower double-precision value of \a __a.
900/// \returns A 128-bit vector. The lower 64 bits contains the comparison
901/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
902static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
903 __m128d __b) {
904 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
905}
906
907/// Compares the lower double-precision floating-point values in each of
908/// the two 128-bit floating-point vectors of [2 x double] to determine if
909/// the value in the first parameter is not less than or equal to the
910/// corresponding value in the second parameter.
911///
912/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
913/// If either value in a comparison is NaN, returns true.
914///
915/// \headerfile <x86intrin.h>
916///
917/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
918///
919/// \param __a
920/// A 128-bit vector of [2 x double]. The lower double-precision value is
921/// compared to the lower double-precision value of \a __b.
922/// \param __b
923/// A 128-bit vector of [2 x double]. The lower double-precision value is
924/// compared to the lower double-precision value of \a __a.
925/// \returns A 128-bit vector. The lower 64 bits contains the comparison
926/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
927static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
928 __m128d __b) {
929 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
930}
931
932/// Compares the lower double-precision floating-point values in each of
933/// the two 128-bit floating-point vectors of [2 x double] to determine if
934/// the value in the first parameter is not greater than the corresponding
935/// value in the second parameter.
936///
937/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
938/// If either value in a comparison is NaN, returns true.
939///
940/// \headerfile <x86intrin.h>
941///
942/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
943///
944/// \param __a
945/// A 128-bit vector of [2 x double]. The lower double-precision value is
946/// compared to the lower double-precision value of \a __b.
947/// \param __b
948/// A 128-bit vector of [2 x double]. The lower double-precision value is
949/// compared to the lower double-precision value of \a __a.
950/// \returns A 128-bit vector. The lower 64 bits contains the comparison
951/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
952static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
953 __m128d __b) {
954 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
955 return __extension__(__m128d){__c[0], __a[1]};
956}
957
958/// Compares the lower double-precision floating-point values in each of
959/// the two 128-bit floating-point vectors of [2 x double] to determine if
960/// the value in the first parameter is not greater than or equal to the
961/// corresponding value in the second parameter.
962///
963/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
964/// If either value in a comparison is NaN, returns true.
965///
966/// \headerfile <x86intrin.h>
967///
968/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
969///
970/// \param __a
971/// A 128-bit vector of [2 x double]. The lower double-precision value is
972/// compared to the lower double-precision value of \a __b.
973/// \param __b
974/// A 128-bit vector of [2 x double]. The lower double-precision value is
975/// compared to the lower double-precision value of \a __a.
976/// \returns A 128-bit vector. The lower 64 bits contains the comparison
977/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
978static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
979 __m128d __b) {
980 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
981 return __extension__(__m128d){__c[0], __a[1]};
982}
983
984/// Compares the lower double-precision floating-point values in each of
985/// the two 128-bit floating-point vectors of [2 x double] for equality.
986///
987/// The comparison returns 0 for false, 1 for true. If either value in a
988/// comparison is NaN, returns 0.
989///
990/// \headerfile <x86intrin.h>
991///
992/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
993///
994/// \param __a
995/// A 128-bit vector of [2 x double]. The lower double-precision value is
996/// compared to the lower double-precision value of \a __b.
997/// \param __b
998/// A 128-bit vector of [2 x double]. The lower double-precision value is
999/// compared to the lower double-precision value of \a __a.
1000/// \returns An integer containing the comparison results.
1001static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
1002 __m128d __b) {
1003 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1004}
1005
1006/// Compares the lower double-precision floating-point values in each of
1007/// the two 128-bit floating-point vectors of [2 x double] to determine if
1008/// the value in the first parameter is less than the corresponding value in
1009/// the second parameter.
1010///
1011/// The comparison returns 0 for false, 1 for true. If either value in a
1012/// comparison is NaN, returns 0.
1013///
1014/// \headerfile <x86intrin.h>
1015///
1016/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1017///
1018/// \param __a
1019/// A 128-bit vector of [2 x double]. The lower double-precision value is
1020/// compared to the lower double-precision value of \a __b.
1021/// \param __b
1022/// A 128-bit vector of [2 x double]. The lower double-precision value is
1023/// compared to the lower double-precision value of \a __a.
1024/// \returns An integer containing the comparison results.
1025static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1026 __m128d __b) {
1027 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1028}
1029
1030/// Compares the lower double-precision floating-point values in each of
1031/// the two 128-bit floating-point vectors of [2 x double] to determine if
1032/// the value in the first parameter is less than or equal to the
1033/// corresponding value in the second parameter.
1034///
1035/// The comparison returns 0 for false, 1 for true. If either value in a
1036/// comparison is NaN, returns 0.
1037///
1038/// \headerfile <x86intrin.h>
1039///
1040/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1041///
1042/// \param __a
1043/// A 128-bit vector of [2 x double]. The lower double-precision value is
1044/// compared to the lower double-precision value of \a __b.
1045/// \param __b
1046/// A 128-bit vector of [2 x double]. The lower double-precision value is
1047/// compared to the lower double-precision value of \a __a.
1048/// \returns An integer containing the comparison results.
1049static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1050 __m128d __b) {
1051 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1052}
1053
1054/// Compares the lower double-precision floating-point values in each of
1055/// the two 128-bit floating-point vectors of [2 x double] to determine if
1056/// the value in the first parameter is greater than the corresponding value
1057/// in the second parameter.
1058///
1059/// The comparison returns 0 for false, 1 for true. If either value in a
1060/// comparison is NaN, returns 0.
1061///
1062/// \headerfile <x86intrin.h>
1063///
1064/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1065///
1066/// \param __a
1067/// A 128-bit vector of [2 x double]. The lower double-precision value is
1068/// compared to the lower double-precision value of \a __b.
1069/// \param __b
1070/// A 128-bit vector of [2 x double]. The lower double-precision value is
1071/// compared to the lower double-precision value of \a __a.
1072/// \returns An integer containing the comparison results.
1073static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1074 __m128d __b) {
1075 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1076}
1077
1078/// Compares the lower double-precision floating-point values in each of
1079/// the two 128-bit floating-point vectors of [2 x double] to determine if
1080/// the value in the first parameter is greater than or equal to the
1081/// corresponding value in the second parameter.
1082///
1083/// The comparison returns 0 for false, 1 for true. If either value in a
1084/// comparison is NaN, returns 0.
1085///
1086/// \headerfile <x86intrin.h>
1087///
1088/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1089///
1090/// \param __a
1091/// A 128-bit vector of [2 x double]. The lower double-precision value is
1092/// compared to the lower double-precision value of \a __b.
1093/// \param __b
1094/// A 128-bit vector of [2 x double]. The lower double-precision value is
1095/// compared to the lower double-precision value of \a __a.
1096/// \returns An integer containing the comparison results.
1097static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1098 __m128d __b) {
1099 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1100}
1101
1102/// Compares the lower double-precision floating-point values in each of
1103/// the two 128-bit floating-point vectors of [2 x double] to determine if
1104/// the value in the first parameter is unequal to the corresponding value in
1105/// the second parameter.
1106///
1107/// The comparison returns 0 for false, 1 for true. If either value in a
1108/// comparison is NaN, returns 1.
1109///
1110/// \headerfile <x86intrin.h>
1111///
1112/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1113///
1114/// \param __a
1115/// A 128-bit vector of [2 x double]. The lower double-precision value is
1116/// compared to the lower double-precision value of \a __b.
1117/// \param __b
1118/// A 128-bit vector of [2 x double]. The lower double-precision value is
1119/// compared to the lower double-precision value of \a __a.
1120/// \returns An integer containing the comparison results.
1121static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1122 __m128d __b) {
1123 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1124}
1125
1126/// Compares the lower double-precision floating-point values in each of
1127/// the two 128-bit floating-point vectors of [2 x double] for equality.
1128///
1129/// The comparison returns 0 for false, 1 for true. If either value in a
1130/// comparison is NaN, returns 0.
1131///
1132/// \headerfile <x86intrin.h>
1133///
1134/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1135///
1136/// \param __a
1137/// A 128-bit vector of [2 x double]. The lower double-precision value is
1138/// compared to the lower double-precision value of \a __b.
1139/// \param __b
1140/// A 128-bit vector of [2 x double]. The lower double-precision value is
1141/// compared to the lower double-precision value of \a __a.
1142/// \returns An integer containing the comparison results.
1143static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1144 __m128d __b) {
1145 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1146}
1147
1148/// Compares the lower double-precision floating-point values in each of
1149/// the two 128-bit floating-point vectors of [2 x double] to determine if
1150/// the value in the first parameter is less than the corresponding value in
1151/// the second parameter.
1152///
1153/// The comparison returns 0 for false, 1 for true. If either value in a
1154/// comparison is NaN, returns 0.
1155///
1156/// \headerfile <x86intrin.h>
1157///
1158/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1159///
1160/// \param __a
1161/// A 128-bit vector of [2 x double]. The lower double-precision value is
1162/// compared to the lower double-precision value of \a __b.
1163/// \param __b
1164/// A 128-bit vector of [2 x double]. The lower double-precision value is
1165/// compared to the lower double-precision value of \a __a.
1166/// \returns An integer containing the comparison results.
1167static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1168 __m128d __b) {
1169 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1170}
1171
1172/// Compares the lower double-precision floating-point values in each of
1173/// the two 128-bit floating-point vectors of [2 x double] to determine if
1174/// the value in the first parameter is less than or equal to the
1175/// corresponding value in the second parameter.
1176///
1177/// The comparison returns 0 for false, 1 for true. If either value in a
1178/// comparison is NaN, returns 0.
1179///
1180/// \headerfile <x86intrin.h>
1181///
1182/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1183///
1184/// \param __a
1185/// A 128-bit vector of [2 x double]. The lower double-precision value is
1186/// compared to the lower double-precision value of \a __b.
1187/// \param __b
1188/// A 128-bit vector of [2 x double]. The lower double-precision value is
1189/// compared to the lower double-precision value of \a __a.
1190/// \returns An integer containing the comparison results.
1191static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1192 __m128d __b) {
1193 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1194}
1195
1196/// Compares the lower double-precision floating-point values in each of
1197/// the two 128-bit floating-point vectors of [2 x double] to determine if
1198/// the value in the first parameter is greater than the corresponding value
1199/// in the second parameter.
1200///
1201/// The comparison returns 0 for false, 1 for true. If either value in a
1202/// comparison is NaN, returns 0.
1203///
1204/// \headerfile <x86intrin.h>
1205///
1206/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1207///
1208/// \param __a
1209/// A 128-bit vector of [2 x double]. The lower double-precision value is
1210/// compared to the lower double-precision value of \a __b.
1211/// \param __b
1212/// A 128-bit vector of [2 x double]. The lower double-precision value is
1213/// compared to the lower double-precision value of \a __a.
1214/// \returns An integer containing the comparison results.
1215static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1216 __m128d __b) {
1217 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1218}
1219
1220/// Compares the lower double-precision floating-point values in each of
1221/// the two 128-bit floating-point vectors of [2 x double] to determine if
1222/// the value in the first parameter is greater than or equal to the
1223/// corresponding value in the second parameter.
1224///
1225/// The comparison returns 0 for false, 1 for true. If either value in a
1226/// comparison is NaN, returns 0.
1227///
1228/// \headerfile <x86intrin.h>
1229///
1230/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1231///
1232/// \param __a
1233/// A 128-bit vector of [2 x double]. The lower double-precision value is
1234/// compared to the lower double-precision value of \a __b.
1235/// \param __b
1236/// A 128-bit vector of [2 x double]. The lower double-precision value is
1237/// compared to the lower double-precision value of \a __a.
1238/// \returns An integer containing the comparison results.
1239static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1240 __m128d __b) {
1241 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1242}
1243
1244/// Compares the lower double-precision floating-point values in each of
1245/// the two 128-bit floating-point vectors of [2 x double] to determine if
1246/// the value in the first parameter is unequal to the corresponding value in
1247/// the second parameter.
1248///
1249/// The comparison returns 0 for false, 1 for true. If either value in a
1250/// comparison is NaN, returns 1.
1251///
1252/// \headerfile <x86intrin.h>
1253///
1254/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1255///
1256/// \param __a
1257/// A 128-bit vector of [2 x double]. The lower double-precision value is
1258/// compared to the lower double-precision value of \a __b.
1259/// \param __b
1260/// A 128-bit vector of [2 x double]. The lower double-precision value is
1261/// compared to the lower double-precision value of \a __a.
1262/// \returns An integer containing the comparison result.
1263static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1264 __m128d __b) {
1265 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1266}
1267
1268/// Converts the two double-precision floating-point elements of a
1269/// 128-bit vector of [2 x double] into two single-precision floating-point
1270/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1271/// The upper 64 bits of the result vector are set to zero.
1272///
1273/// \headerfile <x86intrin.h>
1274///
1275/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1276///
1277/// \param __a
1278/// A 128-bit vector of [2 x double].
1279/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1280/// converted values. The upper 64 bits are set to zero.
1281static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1283 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1284}
1285
1286/// Converts the lower two single-precision floating-point elements of a
1287/// 128-bit vector of [4 x float] into two double-precision floating-point
1288/// values, returned in a 128-bit vector of [2 x double]. The upper two
1289/// elements of the input vector are unused.
1290///
1291/// \headerfile <x86intrin.h>
1292///
1293/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1294///
1295/// \param __a
1296/// A 128-bit vector of [4 x float]. The lower two single-precision
1297/// floating-point elements are converted to double-precision values. The
1298/// upper two elements are unused.
1299/// \returns A 128-bit vector of [2 x double] containing the converted values.
1300static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1302 return (__m128d) __builtin_convertvector(
1303 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1304}
1305
1306/// Converts the lower two integer elements of a 128-bit vector of
1307/// [4 x i32] into two double-precision floating-point values, returned in a
1308/// 128-bit vector of [2 x double].
1309///
1310/// The upper two elements of the input vector are unused.
1311///
1312/// \headerfile <x86intrin.h>
1313///
1314/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1315///
1316/// \param __a
1317/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1318/// converted to double-precision values.
1319///
1320/// The upper two elements are unused.
1321/// \returns A 128-bit vector of [2 x double] containing the converted values.
1322static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1324 return (__m128d) __builtin_convertvector(
1325 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1326}
1327
1328/// Converts the two double-precision floating-point elements of a
1329/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1330/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1331/// 64 bits of the result vector are set to zero.
1332///
1333/// If a converted value does not fit in a 32-bit integer, raises a
1334/// floating-point invalid exception. If the exception is masked, returns
1335/// the most negative integer.
1336///
1337/// \headerfile <x86intrin.h>
1338///
1339/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1340///
1341/// \param __a
1342/// A 128-bit vector of [2 x double].
1343/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1344/// converted values. The upper 64 bits are set to zero.
1345static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1346 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1347}
1348
1349/// Converts the low-order element of a 128-bit vector of [2 x double]
1350/// into a 32-bit signed integer value.
1351///
1352/// If the converted value does not fit in a 32-bit integer, raises a
1353/// floating-point invalid exception. If the exception is masked, returns
1354/// the most negative integer.
1355///
1356/// \headerfile <x86intrin.h>
1357///
1358/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1359///
1360/// \param __a
1361/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1362/// conversion.
1363/// \returns A 32-bit signed integer containing the converted value.
1364static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1365 return __builtin_ia32_cvtsd2si((__v2df)__a);
1366}
1367
1368/// Converts the lower double-precision floating-point element of a
1369/// 128-bit vector of [2 x double], in the second parameter, into a
1370/// single-precision floating-point value, returned in the lower 32 bits of a
1371/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1372/// copied from the upper 96 bits of the first parameter.
1373///
1374/// \headerfile <x86intrin.h>
1375///
1376/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1377///
1378/// \param __a
1379/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1380/// copied to the upper 96 bits of the result.
1381/// \param __b
1382/// A 128-bit vector of [2 x double]. The lower double-precision
1383/// floating-point element is used in the conversion.
1384/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1385/// converted value from the second parameter. The upper 96 bits are copied
1386/// from the upper 96 bits of the first parameter.
1387static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1388_mm_cvtsd_ss(__m128 __a, __m128d __b) {
1389 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1390}
1391
1392/// Converts a 32-bit signed integer value, in the second parameter, into
1393/// a double-precision floating-point value, returned in the lower 64 bits of
1394/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1395/// are copied from the upper 64 bits of the first parameter.
1396///
1397/// \headerfile <x86intrin.h>
1398///
1399/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1400///
1401/// \param __a
1402/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1403/// copied to the upper 64 bits of the result.
1404/// \param __b
1405/// A 32-bit signed integer containing the value to be converted.
1406/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1407/// converted value from the second parameter. The upper 64 bits are copied
1408/// from the upper 64 bits of the first parameter.
1409static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1410_mm_cvtsi32_sd(__m128d __a, int __b) {
1411 __a[0] = __b;
1412 return __a;
1413}
1414
1415/// Converts the lower single-precision floating-point element of a
1416/// 128-bit vector of [4 x float], in the second parameter, into a
1417/// double-precision floating-point value, returned in the lower 64 bits of
1418/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1419/// are copied from the upper 64 bits of the first parameter.
1420///
1421/// \headerfile <x86intrin.h>
1422///
1423/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1424///
1425/// \param __a
1426/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1427/// copied to the upper 64 bits of the result.
1428/// \param __b
1429/// A 128-bit vector of [4 x float]. The lower single-precision
1430/// floating-point element is used in the conversion.
1431/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1432/// converted value from the second parameter. The upper 64 bits are copied
1433/// from the upper 64 bits of the first parameter.
1434static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1435_mm_cvtss_sd(__m128d __a, __m128 __b) {
1436 __a[0] = __b[0];
1437 return __a;
1438}
1439
1440/// Converts the two double-precision floating-point elements of a
1441/// 128-bit vector of [2 x double] into two signed truncated (rounded
1442/// toward zero) 32-bit integer values, returned in the lower 64 bits
1443/// of a 128-bit vector of [4 x i32].
1444///
1445/// If a converted value does not fit in a 32-bit integer, raises a
1446/// floating-point invalid exception. If the exception is masked, returns
1447/// the most negative integer.
1448///
1449/// \headerfile <x86intrin.h>
1450///
1451/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1452/// instruction.
1453///
1454/// \param __a
1455/// A 128-bit vector of [2 x double].
1456/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1457/// converted values. The upper 64 bits are set to zero.
1458static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1459 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1460}
1461
1462/// Converts the low-order element of a [2 x double] vector into a 32-bit
1463/// signed truncated (rounded toward zero) integer value.
1464///
1465/// If the converted value does not fit in a 32-bit integer, raises a
1466/// floating-point invalid exception. If the exception is masked, returns
1467/// the most negative integer.
1468///
1469/// \headerfile <x86intrin.h>
1470///
1471/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1472/// instruction.
1473///
1474/// \param __a
1475/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1476/// conversion.
1477/// \returns A 32-bit signed integer containing the converted value.
1478static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1479 return __builtin_ia32_cvttsd2si((__v2df)__a);
1480}
1481
1482/// Converts the two double-precision floating-point elements of a
1483/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1484/// returned in a 64-bit vector of [2 x i32].
1485///
1486/// If a converted value does not fit in a 32-bit integer, raises a
1487/// floating-point invalid exception. If the exception is masked, returns
1488/// the most negative integer.
1489///
1490/// \headerfile <x86intrin.h>
1491///
1492/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1493///
1494/// \param __a
1495/// A 128-bit vector of [2 x double].
1496/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1497static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
1498 return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
1499}
1500
1501/// Converts the two double-precision floating-point elements of a
1502/// 128-bit vector of [2 x double] into two signed truncated (rounded toward
1503/// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1504///
1505/// If a converted value does not fit in a 32-bit integer, raises a
1506/// floating-point invalid exception. If the exception is masked, returns
1507/// the most negative integer.
1508///
1509/// \headerfile <x86intrin.h>
1510///
1511/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1512///
1513/// \param __a
1514/// A 128-bit vector of [2 x double].
1515/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1516static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
1517 return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
1518}
1519
1520/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1521/// [2 x i32] into two double-precision floating-point values, returned in a
1522/// 128-bit vector of [2 x double].
1523///
1524/// \headerfile <x86intrin.h>
1525///
1526/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1527///
1528/// \param __a
1529/// A 64-bit vector of [2 x i32].
1530/// \returns A 128-bit vector of [2 x double] containing the converted values.
1531static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1533 return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
1534}
1535
1536/// Returns the low-order element of a 128-bit vector of [2 x double] as
1537/// a double-precision floating-point value.
1538///
1539/// \headerfile <x86intrin.h>
1540///
1541/// This intrinsic has no corresponding instruction.
1542///
1543/// \param __a
1544/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1545/// \returns A double-precision floating-point value copied from the lower 64
1546/// bits of \a __a.
1547static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR
1549 return __a[0];
1550}
1551
1552/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1553/// memory location.
1554///
1555/// \headerfile <x86intrin.h>
1556///
1557/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1558///
1559/// \param __dp
1560/// A pointer to a 128-bit memory location. The address of the memory
1561/// location has to be 16-byte aligned.
1562/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1563static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1564 return *(const __m128d *)__dp;
1565}
1566
1567/// Loads a double-precision floating-point value from a specified memory
1568/// location and duplicates it to both vector elements of a 128-bit vector of
1569/// [2 x double].
1570///
1571/// \headerfile <x86intrin.h>
1572///
1573/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1574///
1575/// \param __dp
1576/// A pointer to a memory location containing a double-precision value.
1577/// \returns A 128-bit vector of [2 x double] containing the loaded and
1578/// duplicated values.
1579static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1580 struct __mm_load1_pd_struct {
1581 double __u;
1582 } __attribute__((__packed__, __may_alias__));
1583 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1584 return __extension__(__m128d){__u, __u};
1585}
1586
/* Alias: _mm_load_pd1 forwards its argument unchanged to _mm_load1_pd. */
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1588
1589/// Loads two double-precision values, in reverse order, from an aligned
1590/// memory location into a 128-bit vector of [2 x double].
1591///
1592/// \headerfile <x86intrin.h>
1593///
1594/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1595/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1596/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1597///
1598/// \param __dp
1599/// A 16-byte aligned pointer to an array of double-precision values to be
1600/// loaded in reverse order.
1601/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1602/// values.
1603static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1604 __m128d __u = *(const __m128d *)__dp;
1605 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1606}
1607
1608/// Loads a 128-bit floating-point vector of [2 x double] from an
1609/// unaligned memory location.
1610///
1611/// \headerfile <x86intrin.h>
1612///
1613/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1614///
1615/// \param __dp
1616/// A pointer to a 128-bit memory location. The address of the memory
1617/// location does not have to be aligned.
1618/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1619static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1620 struct __loadu_pd {
1621 __m128d_u __v;
1622 } __attribute__((__packed__, __may_alias__));
1623 return ((const struct __loadu_pd *)__dp)->__v;
1624}
1625
1626/// Loads a 64-bit integer value to the low element of a 128-bit integer
1627/// vector and clears the upper element.
1628///
1629/// \headerfile <x86intrin.h>
1630///
1631/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1632///
1633/// \param __a
1634/// A pointer to a 64-bit memory location. The address of the memory
1635/// location does not have to be aligned.
1636/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1637static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1638 struct __loadu_si64 {
1639 long long __v;
1640 } __attribute__((__packed__, __may_alias__));
1641 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1642 return __extension__(__m128i)(__v2di){__u, 0LL};
1643}
1644
/// Loads a 32-bit integer value to the low element of a 128-bit integer
/// vector and clears the upper elements.
1647///
1648/// \headerfile <x86intrin.h>
1649///
1650/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1651///
1652/// \param __a
1653/// A pointer to a 32-bit memory location. The address of the memory
1654/// location does not have to be aligned.
1655/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1656static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1657 struct __loadu_si32 {
1658 int __v;
1659 } __attribute__((__packed__, __may_alias__));
1660 int __u = ((const struct __loadu_si32 *)__a)->__v;
1661 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1662}
1663
/// Loads a 16-bit integer value to the low element of a 128-bit integer
/// vector and clears the upper elements.
1666///
1667/// \headerfile <x86intrin.h>
1668///
1669/// This intrinsic does not correspond to a specific instruction.
1670///
1671/// \param __a
1672/// A pointer to a 16-bit memory location. The address of the memory
1673/// location does not have to be aligned.
1674/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1675static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1676 struct __loadu_si16 {
1677 short __v;
1678 } __attribute__((__packed__, __may_alias__));
1679 short __u = ((const struct __loadu_si16 *)__a)->__v;
1680 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1681}
1682
/// Loads a 64-bit double-precision value to the low element of a
/// 128-bit vector of [2 x double] and clears the upper element.
1685///
1686/// \headerfile <x86intrin.h>
1687///
1688/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1689///
1690/// \param __dp
1691/// A pointer to a memory location containing a double-precision value.
1692/// The address of the memory location does not have to be aligned.
1693/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1694static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1695 struct __mm_load_sd_struct {
1696 double __u;
1697 } __attribute__((__packed__, __may_alias__));
1698 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1699 return __extension__(__m128d){__u, 0};
1700}
1701
1702/// Loads a double-precision value into the high-order bits of a 128-bit
1703/// vector of [2 x double]. The low-order bits are copied from the low-order
1704/// bits of the first operand.
1705///
1706/// \headerfile <x86intrin.h>
1707///
1708/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1709///
1710/// \param __a
1711/// A 128-bit vector of [2 x double]. \n
1712/// Bits [63:0] are written to bits [63:0] of the result.
1713/// \param __dp
1714/// A pointer to a 64-bit memory location containing a double-precision
1715/// floating-point value that is loaded. The loaded value is written to bits
1716/// [127:64] of the result. The address of the memory location does not have
1717/// to be aligned.
1718/// \returns A 128-bit vector of [2 x double] containing the moved values.
1719static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1720 double const *__dp) {
1721 struct __mm_loadh_pd_struct {
1722 double __u;
1723 } __attribute__((__packed__, __may_alias__));
1724 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1725 return __extension__(__m128d){__a[0], __u};
1726}
1727
1728/// Loads a double-precision value into the low-order bits of a 128-bit
1729/// vector of [2 x double]. The high-order bits are copied from the
1730/// high-order bits of the first operand.
1731///
1732/// \headerfile <x86intrin.h>
1733///
1734/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1735///
1736/// \param __a
1737/// A 128-bit vector of [2 x double]. \n
1738/// Bits [127:64] are written to bits [127:64] of the result.
1739/// \param __dp
1740/// A pointer to a 64-bit memory location containing a double-precision
1741/// floating-point value that is loaded. The loaded value is written to bits
1742/// [63:0] of the result. The address of the memory location does not have to
1743/// be aligned.
1744/// \returns A 128-bit vector of [2 x double] containing the moved values.
1745static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1746 double const *__dp) {
1747 struct __mm_loadl_pd_struct {
1748 double __u;
1749 } __attribute__((__packed__, __may_alias__));
1750 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1751 return __extension__(__m128d){__u, __a[1]};
1752}
1753
1754/// Constructs a 128-bit floating-point vector of [2 x double] with
1755/// unspecified content. This could be used as an argument to another
1756/// intrinsic function where the argument is required but the value is not
1757/// actually used.
1758///
1759/// \headerfile <x86intrin.h>
1760///
1761/// This intrinsic has no corresponding instruction.
1762///
1763/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1764/// content.
1765static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1766 return (__m128d)__builtin_ia32_undef128();
1767}
1768
1769/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1770/// 64 bits of the vector are initialized with the specified double-precision
1771/// floating-point value. The upper 64 bits are set to zero.
1772///
1773/// \headerfile <x86intrin.h>
1774///
1775/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1776///
1777/// \param __w
1778/// A double-precision floating-point value used to initialize the lower 64
1779/// bits of the result.
1780/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1781/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1782/// set to zero.
1783static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w) {
1784 return __extension__(__m128d){__w, 0.0};
1785}
1786
1787/// Constructs a 128-bit floating-point vector of [2 x double], with each
1788/// of the two double-precision floating-point vector elements set to the
1789/// specified double-precision floating-point value.
1790///
1791/// \headerfile <x86intrin.h>
1792///
1793/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1794///
1795/// \param __w
1796/// A double-precision floating-point value used to initialize each vector
1797/// element of the result.
1798/// \returns An initialized 128-bit floating-point vector of [2 x double].
1799static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w) {
1800 return __extension__(__m128d){__w, __w};
1801}
1802
1803/// Constructs a 128-bit floating-point vector of [2 x double], with each
1804/// of the two double-precision floating-point vector elements set to the
1805/// specified double-precision floating-point value.
1806///
1807/// \headerfile <x86intrin.h>
1808///
1809/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1810///
1811/// \param __w
1812/// A double-precision floating-point value used to initialize each vector
1813/// element of the result.
1814/// \returns An initialized 128-bit floating-point vector of [2 x double].
1815static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w) {
1816 return _mm_set1_pd(__w);
1817}
1818
1819/// Constructs a 128-bit floating-point vector of [2 x double]
1820/// initialized with the specified double-precision floating-point values.
1821///
1822/// \headerfile <x86intrin.h>
1823///
1824/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1825///
1826/// \param __w
1827/// A double-precision floating-point value used to initialize the upper 64
1828/// bits of the result.
1829/// \param __x
1830/// A double-precision floating-point value used to initialize the lower 64
1831/// bits of the result.
1832/// \returns An initialized 128-bit floating-point vector of [2 x double].
1833static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w,
1834 double __x) {
1835 return __extension__(__m128d){__x, __w};
1836}
1837
1838/// Constructs a 128-bit floating-point vector of [2 x double],
1839/// initialized in reverse order with the specified double-precision
1840/// floating-point values.
1841///
1842/// \headerfile <x86intrin.h>
1843///
1844/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1845///
1846/// \param __w
1847/// A double-precision floating-point value used to initialize the lower 64
1848/// bits of the result.
1849/// \param __x
1850/// A double-precision floating-point value used to initialize the upper 64
1851/// bits of the result.
1852/// \returns An initialized 128-bit floating-point vector of [2 x double].
1853static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w,
1854 double __x) {
1855 return __extension__(__m128d){__w, __x};
1856}
1857
1858/// Constructs a 128-bit floating-point vector of [2 x double]
1859/// initialized to zero.
1860///
1861/// \headerfile <x86intrin.h>
1862///
1863/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1864///
1865/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1866/// all elements set to zero.
1867static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void) {
1868 return __extension__(__m128d){0.0, 0.0};
1869}
1870
1871/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1872/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1873/// 64 bits are set to the upper 64 bits of the first parameter.
1874///
1875/// \headerfile <x86intrin.h>
1876///
1877/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1878///
1879/// \param __a
1880/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1881/// upper 64 bits of the result.
1882/// \param __b
1883/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1884/// lower 64 bits of the result.
1885/// \returns A 128-bit vector of [2 x double] containing the moved values.
1886static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1887_mm_move_sd(__m128d __a, __m128d __b) {
1888 __a[0] = __b[0];
1889 return __a;
1890}
1891
1892/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1893/// memory location.
1894///
1895/// \headerfile <x86intrin.h>
1896///
1897/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1898///
1899/// \param __dp
1900/// A pointer to a 64-bit memory location.
1901/// \param __a
1902/// A 128-bit vector of [2 x double] containing the value to be stored.
1903static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1904 __m128d __a) {
1905 struct __mm_store_sd_struct {
1906 double __u;
1907 } __attribute__((__packed__, __may_alias__));
1908 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1909}
1910
1911/// Moves packed double-precision values from a 128-bit vector of
1912/// [2 x double] to a memory location.
1913///
1914/// \headerfile <x86intrin.h>
1915///
1916/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1917///
1918/// \param __dp
1919/// A pointer to an aligned memory location that can store two
1920/// double-precision values.
1921/// \param __a
1922/// A packed 128-bit vector of [2 x double] containing the values to be
1923/// moved.
1924static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1925 __m128d __a) {
1926 *(__m128d *)__dp = __a;
1927}
1928
1929/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1930/// the upper and lower 64 bits of a memory location.
1931///
1932/// \headerfile <x86intrin.h>
1933///
1934/// This intrinsic corresponds to the
1935/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1936///
1937/// \param __dp
1938/// A pointer to a memory location that can store two double-precision
1939/// values.
1940/// \param __a
1941/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1942/// of the values in \a __dp.
1943static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1944 __m128d __a) {
1945 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1946 _mm_store_pd(__dp, __a);
1947}
1948
1949/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1950/// the upper and lower 64 bits of a memory location.
1951///
1952/// \headerfile <x86intrin.h>
1953///
1954/// This intrinsic corresponds to the
1955/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1956///
1957/// \param __dp
1958/// A pointer to a memory location that can store two double-precision
1959/// values.
1960/// \param __a
1961/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1962/// of the values in \a __dp.
1963static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1964 __m128d __a) {
1965 _mm_store1_pd(__dp, __a);
1966}
1967
1968/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1969/// location.
1970///
1971/// \headerfile <x86intrin.h>
1972///
1973/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1974///
1975/// \param __dp
1976/// A pointer to a 128-bit memory location. The address of the memory
1977/// location does not have to be aligned.
1978/// \param __a
1979/// A 128-bit vector of [2 x double] containing the values to be stored.
1980static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1981 __m128d __a) {
1982 struct __storeu_pd {
1983 __m128d_u __v;
1984 } __attribute__((__packed__, __may_alias__));
1985 ((struct __storeu_pd *)__dp)->__v = __a;
1986}
1987
1988/// Stores two double-precision values, in reverse order, from a 128-bit
1989/// vector of [2 x double] to a 16-byte aligned memory location.
1990///
1991/// \headerfile <x86intrin.h>
1992///
1993/// This intrinsic corresponds to a shuffling instruction followed by a
1994/// <c> VMOVAPD / MOVAPD </c> instruction.
1995///
1996/// \param __dp
1997/// A pointer to a 16-byte aligned memory location that can store two
1998/// double-precision values.
1999/// \param __a
2000/// A 128-bit vector of [2 x double] containing the values to be reversed and
2001/// stored.
2002static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
2003 __m128d __a) {
2004 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2005 *(__m128d *)__dp = __a;
2006}
2007
2008/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2009/// memory location.
2010///
2011/// \headerfile <x86intrin.h>
2012///
2013/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2014///
2015/// \param __dp
2016/// A pointer to a 64-bit memory location.
2017/// \param __a
2018/// A 128-bit vector of [2 x double] containing the value to be stored.
2019static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2020 __m128d __a) {
2021 struct __mm_storeh_pd_struct {
2022 double __u;
2023 } __attribute__((__packed__, __may_alias__));
2024 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2025}
2026
2027/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2028/// memory location.
2029///
2030/// \headerfile <x86intrin.h>
2031///
2032/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2033///
2034/// \param __dp
2035/// A pointer to a 64-bit memory location.
2036/// \param __a
2037/// A 128-bit vector of [2 x double] containing the value to be stored.
2038static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2039 __m128d __a) {
2040 struct __mm_storeh_pd_struct {
2041 double __u;
2042 } __attribute__((__packed__, __may_alias__));
2043 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2044}
2045
2046/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2047/// saving the lower 8 bits of each sum in the corresponding element of a
2048/// 128-bit result vector of [16 x i8].
2049///
2050/// The integer elements of both parameters can be either signed or unsigned.
2051///
2052/// \headerfile <x86intrin.h>
2053///
2054/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2055///
2056/// \param __a
2057/// A 128-bit vector of [16 x i8].
2058/// \param __b
2059/// A 128-bit vector of [16 x i8].
2060/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2061/// parameters.
2062static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2063_mm_add_epi8(__m128i __a, __m128i __b) {
2064 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2065}
2066
2067/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2068/// saving the lower 16 bits of each sum in the corresponding element of a
2069/// 128-bit result vector of [8 x i16].
2070///
2071/// The integer elements of both parameters can be either signed or unsigned.
2072///
2073/// \headerfile <x86intrin.h>
2074///
2075/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2076///
2077/// \param __a
2078/// A 128-bit vector of [8 x i16].
2079/// \param __b
2080/// A 128-bit vector of [8 x i16].
2081/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2082/// parameters.
2083static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2084_mm_add_epi16(__m128i __a, __m128i __b) {
2085 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2086}
2087
2088/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2089/// saving the lower 32 bits of each sum in the corresponding element of a
2090/// 128-bit result vector of [4 x i32].
2091///
2092/// The integer elements of both parameters can be either signed or unsigned.
2093///
2094/// \headerfile <x86intrin.h>
2095///
2096/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2097///
2098/// \param __a
2099/// A 128-bit vector of [4 x i32].
2100/// \param __b
2101/// A 128-bit vector of [4 x i32].
2102/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2103/// parameters.
2104static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2105_mm_add_epi32(__m128i __a, __m128i __b) {
2106 return (__m128i)((__v4su)__a + (__v4su)__b);
2107}
2108
2109/// Adds two signed or unsigned 64-bit integer values, returning the
2110/// lower 64 bits of the sum.
2111///
2112/// \headerfile <x86intrin.h>
2113///
2114/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2115///
2116/// \param __a
2117/// A 64-bit integer.
2118/// \param __b
2119/// A 64-bit integer.
2120/// \returns A 64-bit integer containing the sum of both parameters.
2121static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_si64(__m64 __a,
2122 __m64 __b) {
2123 return (__m64)(((__v1du)__a)[0] + ((__v1du)__b)[0]);
2124}
2125
2126/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2127/// saving the lower 64 bits of each sum in the corresponding element of a
2128/// 128-bit result vector of [2 x i64].
2129///
2130/// The integer elements of both parameters can be either signed or unsigned.
2131///
2132/// \headerfile <x86intrin.h>
2133///
2134/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2135///
2136/// \param __a
2137/// A 128-bit vector of [2 x i64].
2138/// \param __b
2139/// A 128-bit vector of [2 x i64].
2140/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2141/// parameters.
2142static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2143_mm_add_epi64(__m128i __a, __m128i __b) {
2144 return (__m128i)((__v2du)__a + (__v2du)__b);
2145}
2146
2147/// Adds, with saturation, the corresponding elements of two 128-bit
2148/// signed [16 x i8] vectors, saving each sum in the corresponding element
2149/// of a 128-bit result vector of [16 x i8].
2150///
2151/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2152/// less than 0x80 are saturated to 0x80.
2153///
2154/// \headerfile <x86intrin.h>
2155///
2156/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2157///
2158/// \param __a
2159/// A 128-bit signed [16 x i8] vector.
2160/// \param __b
2161/// A 128-bit signed [16 x i8] vector.
2162/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2163/// both parameters.
2164static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2165_mm_adds_epi8(__m128i __a, __m128i __b) {
2166 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2167}
2168
2169/// Adds, with saturation, the corresponding elements of two 128-bit
2170/// signed [8 x i16] vectors, saving each sum in the corresponding element
2171/// of a 128-bit result vector of [8 x i16].
2172///
2173/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2174/// less than 0x8000 are saturated to 0x8000.
2175///
2176/// \headerfile <x86intrin.h>
2177///
2178/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2179///
2180/// \param __a
2181/// A 128-bit signed [8 x i16] vector.
2182/// \param __b
2183/// A 128-bit signed [8 x i16] vector.
2184/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2185/// both parameters.
2186static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2187_mm_adds_epi16(__m128i __a, __m128i __b) {
2188 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2189}
2190
2191/// Adds, with saturation, the corresponding elements of two 128-bit
2192/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2193/// of a 128-bit result vector of [16 x i8].
2194///
/// Sums greater than 0xFF are saturated to 0xFF. Because both operands are
/// unsigned, a sum can never be negative.
2197///
2198/// \headerfile <x86intrin.h>
2199///
2200/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2201///
2202/// \param __a
2203/// A 128-bit unsigned [16 x i8] vector.
2204/// \param __b
2205/// A 128-bit unsigned [16 x i8] vector.
2206/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2207/// of both parameters.
2208static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2209_mm_adds_epu8(__m128i __a, __m128i __b) {
2210 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2211}
2212
2213/// Adds, with saturation, the corresponding elements of two 128-bit
2214/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2215/// of a 128-bit result vector of [8 x i16].
2216///
/// Sums greater than 0xFFFF are saturated to 0xFFFF. Because both operands
/// are unsigned, a sum can never be negative.
2219///
2220/// \headerfile <x86intrin.h>
2221///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2223///
2224/// \param __a
2225/// A 128-bit unsigned [8 x i16] vector.
2226/// \param __b
2227/// A 128-bit unsigned [8 x i16] vector.
2228/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2229/// of both parameters.
2230static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2231_mm_adds_epu16(__m128i __a, __m128i __b) {
2232 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2233}
2234
2235/// Computes the rounded averages of corresponding elements of two
2236/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2237/// corresponding element of a 128-bit result vector of [16 x i8].
2238///
2239/// \headerfile <x86intrin.h>
2240///
2241/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2242///
2243/// \param __a
2244/// A 128-bit unsigned [16 x i8] vector.
2245/// \param __b
2246/// A 128-bit unsigned [16 x i8] vector.
2247/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2248/// averages of both parameters.
2249static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2250_mm_avg_epu8(__m128i __a, __m128i __b) {
2251 return (__m128i)__builtin_ia32_pavgb128((__v16qu)__a, (__v16qu)__b);
2252}
2253
2254/// Computes the rounded averages of corresponding elements of two
2255/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2256/// corresponding element of a 128-bit result vector of [8 x i16].
2257///
2258/// \headerfile <x86intrin.h>
2259///
2260/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2261///
2262/// \param __a
2263/// A 128-bit unsigned [8 x i16] vector.
2264/// \param __b
2265/// A 128-bit unsigned [8 x i16] vector.
2266/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2267/// averages of both parameters.
2268static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2269_mm_avg_epu16(__m128i __a, __m128i __b) {
  // Both operands are reinterpreted as unsigned 16-bit lanes (__v8hu) for the
  // PAVGW builtin; its result is reinterpreted back to __m128i.
2270 return (__m128i)__builtin_ia32_pavgw128((__v8hu)__a, (__v8hu)__b);
2271}
2272
2273/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2274/// vectors, producing eight intermediate 32-bit signed integer products, and
2275/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2276/// [4 x i32] vector.
2277///
2278/// For example, bits [15:0] of both parameters are multiplied producing a
2279/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2280/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2281/// of the result.
2282///
2283/// \headerfile <x86intrin.h>
2284///
2285/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2286///
2287/// \param __a
2288/// A 128-bit signed [8 x i16] vector.
2289/// \param __b
2290/// A 128-bit signed [8 x i16] vector.
2291/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2292/// of both parameters.
2293static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2294_mm_madd_epi16(__m128i __a, __m128i __b) {
  // Operands are viewed as eight signed 16-bit lanes (__v8hi); the builtin's
  // result is handed back through the generic __m128i type.
2295 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2296}
2297
2298/// Compares corresponding elements of two 128-bit signed [8 x i16]
2299/// vectors, saving the greater value from each comparison in the
2300/// corresponding element of a 128-bit result vector of [8 x i16].
2301///
2302/// \headerfile <x86intrin.h>
2303///
2304/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2305///
2306/// \param __a
2307/// A 128-bit signed [8 x i16] vector.
2308/// \param __b
2309/// A 128-bit signed [8 x i16] vector.
2310/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2311/// each comparison.
2312static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2313_mm_max_epi16(__m128i __a, __m128i __b) {
  // The signed 16-bit lane type (__v8hi) makes this a signed per-lane maximum.
2314 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2315}
2316
2317/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2318/// vectors, saving the greater value from each comparison in the
2319/// corresponding element of a 128-bit result vector of [16 x i8].
2320///
2321/// \headerfile <x86intrin.h>
2322///
2323/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2324///
2325/// \param __a
2326/// A 128-bit unsigned [16 x i8] vector.
2327/// \param __b
2328/// A 128-bit unsigned [16 x i8] vector.
2329/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2330/// each comparison.
2331static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2332_mm_max_epu8(__m128i __a, __m128i __b) {
  // The unsigned byte lane type (__v16qu) makes this an unsigned per-lane
  // maximum.
2333 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2334}
2335
2336/// Compares corresponding elements of two 128-bit signed [8 x i16]
2337/// vectors, saving the smaller value from each comparison in the
2338/// corresponding element of a 128-bit result vector of [8 x i16].
2339///
2340/// \headerfile <x86intrin.h>
2341///
2342/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2343///
2344/// \param __a
2345/// A 128-bit signed [8 x i16] vector.
2346/// \param __b
2347/// A 128-bit signed [8 x i16] vector.
2348/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2349/// each comparison.
2350static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2351_mm_min_epi16(__m128i __a, __m128i __b) {
  // The signed 16-bit lane type (__v8hi) makes this a signed per-lane minimum.
2352 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2353}
2354
2355/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2356/// vectors, saving the smaller value from each comparison in the
2357/// corresponding element of a 128-bit result vector of [16 x i8].
2358///
2359/// \headerfile <x86intrin.h>
2360///
2361/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2362///
2363/// \param __a
2364/// A 128-bit unsigned [16 x i8] vector.
2365/// \param __b
2366/// A 128-bit unsigned [16 x i8] vector.
2367/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2368/// each comparison.
2369static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2370_mm_min_epu8(__m128i __a, __m128i __b) {
  // The unsigned byte lane type (__v16qu) makes this an unsigned per-lane
  // minimum.
2371 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2372}
2373
2374/// Multiplies the corresponding elements of two signed [8 x i16]
2375/// vectors, saving the upper 16 bits of each 32-bit product in the
2376/// corresponding element of a 128-bit signed [8 x i16] result vector.
2377///
2378/// \headerfile <x86intrin.h>
2379///
2380/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2381///
2382/// \param __a
2383/// A 128-bit signed [8 x i16] vector.
2384/// \param __b
2385/// A 128-bit signed [8 x i16] vector.
2386/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2387/// each of the eight 32-bit products.
2388static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2389_mm_mulhi_epi16(__m128i __a, __m128i __b) {
  // Signed variant: operands reach the PMULHW builtin as signed 16-bit lanes
  // (__v8hi). Compare _mm_mulhi_epu16, which uses __v8hu and PMULHUW.
2390 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2391}
2392
2393/// Multiplies the corresponding elements of two unsigned [8 x i16]
2394/// vectors, saving the upper 16 bits of each 32-bit product in the
2395/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2396///
2397/// \headerfile <x86intrin.h>
2398///
2399/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2400///
2401/// \param __a
2402/// A 128-bit unsigned [8 x i16] vector.
2403/// \param __b
2404/// A 128-bit unsigned [8 x i16] vector.
2405/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2406/// of each of the eight 32-bit products.
2407static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2408_mm_mulhi_epu16(__m128i __a, __m128i __b) {
  // Unsigned variant: operands reach the PMULHUW builtin as unsigned 16-bit
  // lanes (__v8hu). Compare _mm_mulhi_epi16, which uses __v8hi and PMULHW.
2409 return (__m128i)__builtin_ia32_pmulhuw128((__v8hu)__a, (__v8hu)__b);
2410}
2411
2412/// Multiplies the corresponding elements of two signed [8 x i16]
2413/// vectors, saving the lower 16 bits of each 32-bit product in the
2414/// corresponding element of a 128-bit signed [8 x i16] result vector.
2415///
2416/// \headerfile <x86intrin.h>
2417///
2418/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2419///
2420/// \param __a
2421/// A 128-bit signed [8 x i16] vector.
2422/// \param __b
2423/// A 128-bit signed [8 x i16] vector.
2424/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2425/// each of the eight 32-bit products.
2426static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2427_mm_mullo_epi16(__m128i __a, __m128i __b) {
  // Multiply in unsigned lanes so overflow wraps (signed lane overflow would
  // be undefined); the low 16 bits of a product do not depend on signedness.
2428 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2429}
2430
2431/// Multiplies 32-bit unsigned integer values contained in the lower bits
2432/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2433/// product.
2434///
2435/// \headerfile <x86intrin.h>
2436///
2437/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2438///
2439/// \param __a
2440/// A 64-bit integer containing one of the source operands.
2441/// \param __b
2442/// A 64-bit integer containing one of the source operands.
2443/// \returns A 64-bit integer vector containing the product of both operands.
2444static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_su32(__m64 __a,
2445 __m64 __b) {
  // Route the 64-bit operands through the 128-bit PMULUDQ builtin.
  // NOTE(review): __zext128/__trunc64 are defined outside this chunk; from
  // their names they presumably widen to 128 bits and keep the low 64 bits.
2446 return __trunc64(__builtin_ia32_pmuludq128((__v4si)__zext128(__a),
2447 (__v4si)__zext128(__b)));
2448}
2449
2450/// Multiplies 32-bit unsigned integer values contained in the lower
2451/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2452/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2453///
2454/// \headerfile <x86intrin.h>
2455///
2456/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2457///
2458/// \param __a
2459/// A [2 x i64] vector containing one of the source operands.
2460/// \param __b
2461/// A [2 x i64] vector containing one of the source operands.
2462/// \returns A [2 x i64] vector containing the product of both operands.
2463static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2464_mm_mul_epu32(__m128i __a, __m128i __b) {
  // The (__v4si) casts only adapt the operands to the builtin's parameter
  // type. The result is returned without an explicit (__m128i) cast, so the
  // builtin's return type is presumably already compatible -- TODO confirm.
2465 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2466}
2467
2468/// Computes the absolute differences of corresponding 8-bit integer
2469/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2470/// separately sums the second 8 absolute differences. Packs these two
2471/// unsigned 16-bit integer sums into the upper and lower elements of a
2472/// [2 x i64] vector.
2473///
2474/// \headerfile <x86intrin.h>
2475///
2476/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2477///
2478/// \param __a
2479/// A 128-bit integer vector containing one of the source operands.
2480/// \param __b
2481/// A 128-bit integer vector containing one of the source operands.
2482/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2483/// differences between both operands.
2484static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2485 __m128i __b) {
  // Note: unlike its neighbours this wrapper uses __DEFAULT_FN_ATTRS, not the
  // _CONSTEXPR variant. The (__v16qi) casts only adapt the operands to the
  // PSADBW builtin's parameter type.
2486 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2487}
2488
2489/// Subtracts the corresponding 8-bit integer values in the operands.
2490///
2491/// \headerfile <x86intrin.h>
2492///
2493/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2494///
2495/// \param __a
2496/// A 128-bit integer vector containing the minuends.
2497/// \param __b
2498/// A 128-bit integer vector containing the subtrahends.
2499/// \returns A 128-bit integer vector containing the differences of the values
2500/// in the operands.
2501static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2502_mm_sub_epi8(__m128i __a, __m128i __b) {
  // Subtract in unsigned byte lanes so wraparound is well defined; the
  // resulting bits are the same for signed and unsigned inputs.
2503 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2504}
2505
2506/// Subtracts the corresponding 16-bit integer values in the operands.
2507///
2508/// \headerfile <x86intrin.h>
2509///
2510/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2511///
2512/// \param __a
2513/// A 128-bit integer vector containing the minuends.
2514/// \param __b
2515/// A 128-bit integer vector containing the subtrahends.
2516/// \returns A 128-bit integer vector containing the differences of the values
2517/// in the operands.
2518static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2519_mm_sub_epi16(__m128i __a, __m128i __b) {
  // Subtract in unsigned 16-bit lanes so wraparound is well defined; the
  // resulting bits are the same for signed and unsigned inputs.
2520 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2521}
2522
2523/// Subtracts the corresponding 32-bit integer values in the operands.
2524///
2525/// \headerfile <x86intrin.h>
2526///
2527/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2528///
2529/// \param __a
2530/// A 128-bit integer vector containing the minuends.
2531/// \param __b
2532/// A 128-bit integer vector containing the subtrahends.
2533/// \returns A 128-bit integer vector containing the differences of the values
2534/// in the operands.
2535static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2536_mm_sub_epi32(__m128i __a, __m128i __b) {
  // Subtract in unsigned 32-bit lanes so wraparound is well defined; the
  // resulting bits are the same for signed and unsigned inputs.
2537 return (__m128i)((__v4su)__a - (__v4su)__b);
2538}
2539
2540/// Subtracts signed or unsigned 64-bit integer values and writes the
2541/// difference to the corresponding bits in the destination.
2542///
2543/// \headerfile <x86intrin.h>
2544///
2545/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2546///
2547/// \param __a
2548/// A 64-bit integer vector containing the minuend.
2549/// \param __b
2550/// A 64-bit integer vector containing the subtrahend.
2551/// \returns A 64-bit integer vector containing the difference of the values in
2552/// the operands.
2553static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_si64(__m64 __a,
2554 __m64 __b) {
  // Subtract element 0 of each operand viewed through __v1du (presumably one
  // unsigned 64-bit lane, by analogy with __v2du), so the arithmetic wraps;
  // the scalar difference is then converted back to __m64.
2555 return (__m64)(((__v1du)__a)[0] - ((__v1du)__b)[0]);
2556}
2557
2558/// Subtracts the corresponding elements of two [2 x i64] vectors.
2559///
2560/// \headerfile <x86intrin.h>
2561///
2562/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2563///
2564/// \param __a
2565/// A 128-bit integer vector containing the minuends.
2566/// \param __b
2567/// A 128-bit integer vector containing the subtrahends.
2568/// \returns A 128-bit integer vector containing the differences of the values
2569/// in the operands.
2570static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2571_mm_sub_epi64(__m128i __a, __m128i __b) {
  // Subtract in unsigned 64-bit lanes (__v2du) so wraparound is well defined;
  // the resulting bits are the same for signed and unsigned inputs.
2572 return (__m128i)((__v2du)__a - (__v2du)__b);
2573}
2574
2575/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2576/// the input and returns the differences in the corresponding bytes in the
2577/// destination.
2578///
2579/// Differences greater than 0x7F (127) are saturated to 0x7F, and
2580/// differences less than 0x80 (-128) are saturated to 0x80.
2581///
2582/// \headerfile <x86intrin.h>
2583///
2584/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2585///
2586/// \param __a
2587/// A 128-bit integer vector containing the minuends.
2588/// \param __b
2589/// A 128-bit integer vector containing the subtrahends.
2590/// \returns A 128-bit integer vector containing the differences of the values
2591/// in the operands.
2592static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2593_mm_subs_epi8(__m128i __a, __m128i __b) {
  // __v16qs has explicitly signed char lanes, so the saturating subtract is
  // signed regardless of whether plain char is signed on the target.
2594 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2595}
2596
2597/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2598/// the input and returns the differences in the corresponding 16-bit
2599/// elements of the destination.
2600///
2601/// Differences greater than 0x7FFF (32767) are saturated to 0x7FFF, and
2602/// differences less than 0x8000 (-32768) are saturated to 0x8000.
2603///
2604/// \headerfile <x86intrin.h>
2605///
2606/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2607///
2608/// \param __a
2609/// A 128-bit integer vector containing the minuends.
2610/// \param __b
2611/// A 128-bit integer vector containing the subtrahends.
2612/// \returns A 128-bit integer vector containing the differences of the values
2613/// in the operands.
2614static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2615_mm_subs_epi16(__m128i __a, __m128i __b) {
  // The signed 16-bit lane type (__v8hi) makes this the signed saturating
  // subtract.
2616 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2617}
2618
2619/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2620/// the input and returns the differences in the corresponding bytes in the
2621/// destination.
2622///
2623/// Differences less than 0x00 are saturated to 0x00.
2624///
2625/// \headerfile <x86intrin.h>
2626///
2627/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2628///
2629/// \param __a
2630/// A 128-bit integer vector containing the minuends.
2631/// \param __b
2632/// A 128-bit integer vector containing the subtrahends.
2633/// \returns A 128-bit integer vector containing the unsigned integer
2634/// differences of the values in the operands.
2635static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2636_mm_subs_epu8(__m128i __a, __m128i __b) {
  // The unsigned byte lane type (__v16qu) makes this the unsigned saturating
  // subtract.
2637 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2638}
2639
2640/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2641/// the input and returns the differences in the corresponding 16-bit
2642/// elements of the destination.
2643///
2644/// Differences less than 0x0000 are saturated to 0x0000.
2645///
2646/// \headerfile <x86intrin.h>
2647///
2648/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2649///
2650/// \param __a
2651/// A 128-bit integer vector containing the minuends.
2652/// \param __b
2653/// A 128-bit integer vector containing the subtrahends.
2654/// \returns A 128-bit integer vector containing the unsigned integer
2655/// differences of the values in the operands.
2656static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2657_mm_subs_epu16(__m128i __a, __m128i __b) {
  // The unsigned 16-bit lane type (__v8hu) makes this the unsigned saturating
  // subtract.
2658 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2659}
2660
2661/// Performs a bitwise AND of two 128-bit integer vectors.
2662///
2663/// \headerfile <x86intrin.h>
2664///
2665/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2666///
2667/// \param __a
2668/// A 128-bit integer vector containing one of the source operands.
2669/// \param __b
2670/// A 128-bit integer vector containing one of the source operands.
2671/// \returns A 128-bit integer vector containing the bitwise AND of the values
2672/// in both operands.
2673static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2674_mm_and_si128(__m128i __a, __m128i __b) {
  // Lane width is irrelevant for a bitwise AND; [2 x u64] (__v2du) is just
  // the view of the 128 bits used to apply the operator.
2675 return (__m128i)((__v2du)__a & (__v2du)__b);
2676}
2677
2678/// Performs a bitwise AND of two 128-bit integer vectors, using the
2679/// one's complement of the values contained in the first source operand.
2680///
2681/// \headerfile <x86intrin.h>
2682///
2683/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2684///
2685/// \param __a
2686/// A 128-bit vector containing the left source operand. The one's complement
2687/// of this value is used in the bitwise AND.
2688/// \param __b
2689/// A 128-bit vector containing the right source operand.
2690/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2691/// complement of the first operand and the values in the second operand.
2692static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2693_mm_andnot_si128(__m128i __a, __m128i __b) {
  // Only __a is complemented: the cast and ~ bind tighter than &, so this is
  // (~__a) & __b, not ~(__a & __b).
2694 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2695}
2696/// Performs a bitwise OR of two 128-bit integer vectors.
2697///
2698/// \headerfile <x86intrin.h>
2699///
2700/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2701///
2702/// \param __a
2703/// A 128-bit integer vector containing one of the source operands.
2704/// \param __b
2705/// A 128-bit integer vector containing one of the source operands.
2706/// \returns A 128-bit integer vector containing the bitwise OR of the values
2707/// in both operands.
2708static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2709_mm_or_si128(__m128i __a, __m128i __b) {
  // Lane width is irrelevant for a bitwise OR; [2 x u64] (__v2du) is just
  // the view of the 128 bits used to apply the operator.
2710 return (__m128i)((__v2du)__a | (__v2du)__b);
2711}
2712
2713/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2714///
2715/// \headerfile <x86intrin.h>
2716///
2717/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2718///
2719/// \param __a
2720/// A 128-bit integer vector containing one of the source operands.
2721/// \param __b
2722/// A 128-bit integer vector containing one of the source operands.
2723/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2724/// values in both operands.
2725static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2726_mm_xor_si128(__m128i __a, __m128i __b) {
  // Lane width is irrelevant for a bitwise XOR; [2 x u64] (__v2du) is just
  // the view of the 128 bits used to apply the operator.
2727 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2728}
2729
2730/// Left-shifts the 128-bit integer vector operand by the specified
2731/// number of bytes. Low-order bits are cleared.
2732///
/// This is a macro rather than a function: \a a is converted to
/// <c>__m128i</c> and then viewed as a [16 x i8] vector, and \a imm is
/// converted to <c>int</c>, before both are passed to the byte-shift builtin.
/// <c>_mm_bslli_si128</c> expands to the same expression.
///
2733/// \headerfile <x86intrin.h>
2734///
2735/// \code
2736/// __m128i _mm_slli_si128(__m128i a, const int imm);
2737/// \endcode
2738///
2739/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2740///
2741/// \param a
2742/// A 128-bit integer vector containing the source operand.
2743/// \param imm
2744/// An immediate value specifying the number of bytes to left-shift operand
2745/// \a a.
2746/// \returns A 128-bit integer vector containing the left-shifted value.
2747#define _mm_slli_si128(a, imm) \
2748 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \
2749 (int)(imm)))
2750
/// Left-shifts the 128-bit integer vector operand \a a by \a imm bytes.
///
/// This macro expands to exactly the same expression as
/// <c>_mm_slli_si128</c>; see that macro for the parameter description.
2751#define _mm_bslli_si128(a, imm) \
2752 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \
2753 (int)(imm)))
2754
2755/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2756/// by the specified number of bits. Low-order bits are cleared.
2757///
2758/// \headerfile <x86intrin.h>
2759///
2760/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2761///
2762/// \param __a
2763/// A 128-bit integer vector containing the source operand.
2764/// \param __count
2765/// An integer value specifying the number of bits to left-shift each value
2766/// in operand \a __a.
2767/// \returns A 128-bit integer vector containing the left-shifted values.
2768static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2769_mm_slli_epi16(__m128i __a, int __count) {
  // Integer-count form: __count is forwarded unchanged and applies to every
  // 16-bit lane of __a.
2770 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2771}
2772
2773/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2774/// by the specified number of bits. Low-order bits are cleared.
2775///
2776/// \headerfile <x86intrin.h>
2777///
2778/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2779///
2780/// \param __a
2781/// A 128-bit integer vector containing the source operand.
2782/// \param __count
2783/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2784/// to left-shift each value in operand \a __a.
2785/// \returns A 128-bit integer vector containing the left-shifted values.
2786static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2787_mm_sll_epi16(__m128i __a, __m128i __count) {
  // Vector-count form: the (__v8hi) cast on __count only adapts its type for
  // the builtin; a single shift amount is taken from bits [63:0] of __count.
2788 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2789}
2790
2791/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2792/// by the specified number of bits. Low-order bits are cleared.
2793///
2794/// \headerfile <x86intrin.h>
2795///
2796/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2797///
2798/// \param __a
2799/// A 128-bit integer vector containing the source operand.
2800/// \param __count
2801/// An integer value specifying the number of bits to left-shift each value
2802/// in operand \a __a.
2803/// \returns A 128-bit integer vector containing the left-shifted values.
2804static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2805_mm_slli_epi32(__m128i __a, int __count) {
  // Integer-count form: __count is forwarded unchanged and applies to every
  // 32-bit lane of __a.
2806 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2807}
2808
2809/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2810/// by the specified number of bits. Low-order bits are cleared.
2811///
2812/// \headerfile <x86intrin.h>
2813///
2814/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2815///
2816/// \param __a
2817/// A 128-bit integer vector containing the source operand.
2818/// \param __count
2819/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2820/// to left-shift each value in operand \a __a.
2821/// \returns A 128-bit integer vector containing the left-shifted values.
2822static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2823_mm_sll_epi32(__m128i __a, __m128i __count) {
  // Vector-count form: the (__v4si) cast on __count only adapts its type for
  // the builtin; a single shift amount is taken from bits [63:0] of __count.
2824 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2825}
2826
2827/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2828/// by the specified number of bits. Low-order bits are cleared.
2829///
2830/// \headerfile <x86intrin.h>
2831///
2832/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2833///
2834/// \param __a
2835/// A 128-bit integer vector containing the source operand.
2836/// \param __count
2837/// An integer value specifying the number of bits to left-shift each value
2838/// in operand \a __a.
2839/// \returns A 128-bit integer vector containing the left-shifted values.
2840static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2841_mm_slli_epi64(__m128i __a, int __count) {
  // Integer-count form: __count is forwarded unchanged and applies to both
  // 64-bit lanes. Unlike the 16/32-bit variants, the builtin's result is
  // returned without an explicit (__m128i) cast.
2842 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2843}
2844
2845/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2846/// by the specified number of bits. Low-order bits are cleared.
2847///
2848/// \headerfile <x86intrin.h>
2849///
2850/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2851///
2852/// \param __a
2853/// A 128-bit integer vector containing the source operand.
2854/// \param __count
2855/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2856/// to left-shift each value in operand \a __a.
2857/// \returns A 128-bit integer vector containing the left-shifted values.
2858static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2859_mm_sll_epi64(__m128i __a, __m128i __count) {
  // Vector-count form: a single shift amount is taken from bits [63:0] of
  // __count. Unlike the 16/32-bit variants, the builtin's result is returned
  // without an explicit (__m128i) cast.
2860 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2861}
2862
2863/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2864/// by the specified number of bits. High-order bits are filled with the sign
2865/// bit of the initial value.
2866///
2867/// \headerfile <x86intrin.h>
2868///
2869/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2870///
2871/// \param __a
2872/// A 128-bit integer vector containing the source operand.
2873/// \param __count
2874/// An integer value specifying the number of bits to right-shift each value
2875/// in operand \a __a.
2876/// \returns A 128-bit integer vector containing the right-shifted values.
2877static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2878_mm_srai_epi16(__m128i __a, int __count) {
  // Arithmetic (sign-filling) shift, integer-count form: __count is forwarded
  // unchanged and applies to every signed 16-bit lane of __a.
2879 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2880}
2881
2882/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2883/// by the specified number of bits. High-order bits are filled with the sign
2884/// bit of the initial value.
2885///
2886/// \headerfile <x86intrin.h>
2887///
2888/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2889///
2890/// \param __a
2891/// A 128-bit integer vector containing the source operand.
2892/// \param __count
2893/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2894/// to right-shift each value in operand \a __a.
2895/// \returns A 128-bit integer vector containing the right-shifted values.
2896static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2897_mm_sra_epi16(__m128i __a, __m128i __count) {
  // Arithmetic (sign-filling) shift, vector-count form: the (__v8hi) cast on
  // __count only adapts its type; the amount comes from bits [63:0].
2898 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2899}
2900
2901/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2902/// by the specified number of bits. High-order bits are filled with the sign
2903/// bit of the initial value.
2904///
2905/// \headerfile <x86intrin.h>
2906///
2907/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2908///
2909/// \param __a
2910/// A 128-bit integer vector containing the source operand.
2911/// \param __count
2912/// An integer value specifying the number of bits to right-shift each value
2913/// in operand \a __a.
2914/// \returns A 128-bit integer vector containing the right-shifted values.
2915static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2916_mm_srai_epi32(__m128i __a, int __count) {
  // Arithmetic (sign-filling) shift, integer-count form: __count is forwarded
  // unchanged and applies to every signed 32-bit lane of __a.
2917 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2918}
2919
2920/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2921/// by the specified number of bits. High-order bits are filled with the sign
2922/// bit of the initial value.
2923///
2924/// \headerfile <x86intrin.h>
2925///
2926/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2927///
2928/// \param __a
2929/// A 128-bit integer vector containing the source operand.
2930/// \param __count
2931/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2932/// to right-shift each value in operand \a __a.
2933/// \returns A 128-bit integer vector containing the right-shifted values.
2934static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2935_mm_sra_epi32(__m128i __a, __m128i __count) {
  // Arithmetic (sign-filling) shift, vector-count form: the (__v4si) cast on
  // __count only adapts its type; the amount comes from bits [63:0].
2936 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2937}
2938
2939/// Right-shifts the 128-bit integer vector operand by the specified
2940/// number of bytes. High-order bits are cleared.
2941///
/// This is a macro rather than a function: \a a is converted to
/// <c>__m128i</c> and then viewed as a [16 x i8] vector, and \a imm is
/// converted to <c>int</c>, before both are passed to the byte-shift builtin.
/// <c>_mm_bsrli_si128</c> expands to the same expression.
///
2942/// \headerfile <x86intrin.h>
2943///
2944/// \code
2945/// __m128i _mm_srli_si128(__m128i a, const int imm);
2946/// \endcode
2947///
2948/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2949///
2950/// \param a
2951/// A 128-bit integer vector containing the source operand.
2952/// \param imm
2953/// An immediate value specifying the number of bytes to right-shift operand
2954/// \a a.
2955/// \returns A 128-bit integer vector containing the right-shifted value.
2956#define _mm_srli_si128(a, imm) \
2957 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \
2958 (int)(imm)))
2959
/// Right-shifts the 128-bit integer vector operand \a a by \a imm bytes.
///
/// This macro expands to exactly the same expression as
/// <c>_mm_srli_si128</c>; see that macro for the parameter description.
2960#define _mm_bsrli_si128(a, imm) \
2961 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \
2962 (int)(imm)))
2963
2964/// Right-shifts each of the 16-bit values in the 128-bit integer vector
2965/// operand by the specified number of bits. High-order bits are cleared.
2966///
2967/// \headerfile <x86intrin.h>
2968///
2969/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2970///
2971/// \param __a
2972/// A 128-bit integer vector containing the source operand.
2973/// \param __count
2974/// An integer value specifying the number of bits to right-shift each value
2975/// in operand \a __a.
2976/// \returns A 128-bit integer vector containing the right-shifted values.
2977static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2978_mm_srli_epi16(__m128i __a, int __count) {
  // Logical (zero-filling) shift, integer-count form: __count is forwarded
  // unchanged and applies to every 16-bit lane of __a.
2979 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2980}
2981
2982/// Right-shifts each of the 16-bit values in the 128-bit integer vector
2983/// operand by the specified number of bits. High-order bits are cleared.
2984///
2985/// \headerfile <x86intrin.h>
2986///
2987/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2988///
2989/// \param __a
2990/// A 128-bit integer vector containing the source operand.
2991/// \param __count
2992/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2993/// to right-shift each value in operand \a __a.
2994/// \returns A 128-bit integer vector containing the right-shifted values.
2995static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2996_mm_srl_epi16(__m128i __a, __m128i __count) {
  // Logical (zero-filling) shift, vector-count form: the (__v8hi) cast on
  // __count only adapts its type; the amount comes from bits [63:0].
2997 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2998}
2999
3000/// Right-shifts each of the 32-bit values in the 128-bit integer vector
3001/// operand by the specified number of bits. High-order bits are cleared.
3002///
3003/// \headerfile <x86intrin.h>
3004///
3005/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3006///
3007/// \param __a
3008/// A 128-bit integer vector containing the source operand.
3009/// \param __count
3010/// An integer value specifying the number of bits to right-shift each value
3011/// in operand \a __a.
3012/// \returns A 128-bit integer vector containing the right-shifted values.
3013static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3014_mm_srli_epi32(__m128i __a, int __count) {
  // Logical (zero-filling) shift, integer-count form: __count is forwarded
  // unchanged and applies to every 32-bit lane of __a.
3015 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3016}
3017
3018/// Right-shifts each of the 32-bit values in the 128-bit integer vector
3019/// operand by the specified number of bits. High-order bits are cleared.
3020///
3021/// \headerfile <x86intrin.h>
3022///
3023/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3024///
3025/// \param __a
3026/// A 128-bit integer vector containing the source operand.
3027/// \param __count
3028/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3029/// to right-shift each value in operand \a __a.
3030/// \returns A 128-bit integer vector containing the right-shifted values.
3031static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3032_mm_srl_epi32(__m128i __a, __m128i __count) {
  // Logical (zero-filling) shift, vector-count form: the (__v4si) cast on
  // __count only adapts its type; the amount comes from bits [63:0].
3033 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3034}
3035
3036/// Right-shifts each of the 64-bit values in the 128-bit integer vector
3037/// operand by the specified number of bits. High-order bits are cleared.
3038///
3039/// \headerfile <x86intrin.h>
3040///
3041/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3042///
3043/// \param __a
3044/// A 128-bit integer vector containing the source operand.
3045/// \param __count
3046/// An integer value specifying the number of bits to right-shift each value
3047/// in operand \a __a.
3048/// \returns A 128-bit integer vector containing the right-shifted values.
3049static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3050_mm_srli_epi64(__m128i __a, int __count) {
3051 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3052}
3053
3054/// Right-shifts each of 64-bit values in the 128-bit integer vector
3055/// operand by the specified number of bits. High-order bits are cleared.
3056///
3057/// \headerfile <x86intrin.h>
3058///
3059/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3060///
3061/// \param __a
3062/// A 128-bit integer vector containing the source operand.
3063/// \param __count
3064/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3065/// to right-shift each value in operand \a __a.
3066/// \returns A 128-bit integer vector containing the right-shifted values.
3067static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3068_mm_srl_epi64(__m128i __a, __m128i __count) {
3069 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3070}
3071
3072/// Compares each of the corresponding 8-bit values of the 128-bit
3073/// integer vectors for equality.
3074///
3075/// Each comparison returns 0x0 for false, 0xFF for true.
3076///
3077/// \headerfile <x86intrin.h>
3078///
3079/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3080///
3081/// \param __a
3082/// A 128-bit integer vector.
3083/// \param __b
3084/// A 128-bit integer vector.
3085/// \returns A 128-bit integer vector containing the comparison results.
3086static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3087_mm_cmpeq_epi8(__m128i __a, __m128i __b) {
3088 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3089}
3090
3091/// Compares each of the corresponding 16-bit values of the 128-bit
3092/// integer vectors for equality.
3093///
3094/// Each comparison returns 0x0 for false, 0xFFFF for true.
3095///
3096/// \headerfile <x86intrin.h>
3097///
3098/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3099///
3100/// \param __a
3101/// A 128-bit integer vector.
3102/// \param __b
3103/// A 128-bit integer vector.
3104/// \returns A 128-bit integer vector containing the comparison results.
3105static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3106_mm_cmpeq_epi16(__m128i __a, __m128i __b) {
3107 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3108}
3109
3110/// Compares each of the corresponding 32-bit values of the 128-bit
3111/// integer vectors for equality.
3112///
3113/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3114///
3115/// \headerfile <x86intrin.h>
3116///
3117/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3118///
3119/// \param __a
3120/// A 128-bit integer vector.
3121/// \param __b
3122/// A 128-bit integer vector.
3123/// \returns A 128-bit integer vector containing the comparison results.
3124static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3125_mm_cmpeq_epi32(__m128i __a, __m128i __b) {
3126 return (__m128i)((__v4si)__a == (__v4si)__b);
3127}
3128
3129/// Compares each of the corresponding signed 8-bit values of the 128-bit
3130/// integer vectors to determine if the values in the first operand are
3131/// greater than those in the second operand.
3132///
3133/// Each comparison returns 0x0 for false, 0xFF for true.
3134///
3135/// \headerfile <x86intrin.h>
3136///
3137/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3138///
3139/// \param __a
3140/// A 128-bit integer vector.
3141/// \param __b
3142/// A 128-bit integer vector.
3143/// \returns A 128-bit integer vector containing the comparison results.
3144static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3145_mm_cmpgt_epi8(__m128i __a, __m128i __b) {
3146 /* This function always performs a signed comparison, but __v16qi is a char
3147 which may be signed or unsigned, so use __v16qs. */
3148 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3149}
3150
3151/// Compares each of the corresponding signed 16-bit values of the
3152/// 128-bit integer vectors to determine if the values in the first operand
3153/// are greater than those in the second operand.
3154///
3155/// Each comparison returns 0x0 for false, 0xFFFF for true.
3156///
3157/// \headerfile <x86intrin.h>
3158///
3159/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3160///
3161/// \param __a
3162/// A 128-bit integer vector.
3163/// \param __b
3164/// A 128-bit integer vector.
3165/// \returns A 128-bit integer vector containing the comparison results.
3166static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3167_mm_cmpgt_epi16(__m128i __a, __m128i __b) {
3168 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3169}
3170
3171/// Compares each of the corresponding signed 32-bit values of the
3172/// 128-bit integer vectors to determine if the values in the first operand
3173/// are greater than those in the second operand.
3174///
3175/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3176///
3177/// \headerfile <x86intrin.h>
3178///
3179/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3180///
3181/// \param __a
3182/// A 128-bit integer vector.
3183/// \param __b
3184/// A 128-bit integer vector.
3185/// \returns A 128-bit integer vector containing the comparison results.
3186static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3187_mm_cmpgt_epi32(__m128i __a, __m128i __b) {
3188 return (__m128i)((__v4si)__a > (__v4si)__b);
3189}
3190
3191/// Compares each of the corresponding signed 8-bit values of the 128-bit
3192/// integer vectors to determine if the values in the first operand are less
3193/// than those in the second operand.
3194///
3195/// Each comparison returns 0x0 for false, 0xFF for true.
3196///
3197/// \headerfile <x86intrin.h>
3198///
3199/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3200///
3201/// \param __a
3202/// A 128-bit integer vector.
3203/// \param __b
3204/// A 128-bit integer vector.
3205/// \returns A 128-bit integer vector containing the comparison results.
3206static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3207_mm_cmplt_epi8(__m128i __a, __m128i __b) {
3208 return _mm_cmpgt_epi8(__b, __a);
3209}
3210
3211/// Compares each of the corresponding signed 16-bit values of the
3212/// 128-bit integer vectors to determine if the values in the first operand
3213/// are less than those in the second operand.
3214///
3215/// Each comparison returns 0x0 for false, 0xFFFF for true.
3216///
3217/// \headerfile <x86intrin.h>
3218///
3219/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3220///
3221/// \param __a
3222/// A 128-bit integer vector.
3223/// \param __b
3224/// A 128-bit integer vector.
3225/// \returns A 128-bit integer vector containing the comparison results.
3226static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3227_mm_cmplt_epi16(__m128i __a, __m128i __b) {
3228 return _mm_cmpgt_epi16(__b, __a);
3229}
3230
3231/// Compares each of the corresponding signed 32-bit values of the
3232/// 128-bit integer vectors to determine if the values in the first operand
3233/// are less than those in the second operand.
3234///
3235/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3236///
3237/// \headerfile <x86intrin.h>
3238///
3239/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3240///
3241/// \param __a
3242/// A 128-bit integer vector.
3243/// \param __b
3244/// A 128-bit integer vector.
3245/// \returns A 128-bit integer vector containing the comparison results.
3246static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3247_mm_cmplt_epi32(__m128i __a, __m128i __b) {
3248 return _mm_cmpgt_epi32(__b, __a);
3249}
3250
3251#ifdef __x86_64__
3252/// Converts a 64-bit signed integer value from the second operand into a
3253/// double-precision value and returns it in the lower element of a [2 x
3254/// double] vector; the upper element of the returned vector is copied from
3255/// the upper element of the first operand.
3256///
3257/// \headerfile <x86intrin.h>
3258///
3259/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3260///
3261/// \param __a
3262/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3263/// copied to the upper 64 bits of the destination.
3264/// \param __b
3265/// A 64-bit signed integer operand containing the value to be converted.
3266/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3267/// converted value of the second operand. The upper 64 bits are copied from
3268/// the upper 64 bits of the first operand.
3269static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
3270_mm_cvtsi64_sd(__m128d __a, long long __b) {
3271 __a[0] = __b;
3272 return __a;
3273}
3274
3275/// Converts the first (lower) element of a vector of [2 x double] into a
3276/// 64-bit signed integer value.
3277///
3278/// If the converted value does not fit in a 64-bit integer, raises a
3279/// floating-point invalid exception. If the exception is masked, returns
3280/// the most negative integer.
3281///
3282/// \headerfile <x86intrin.h>
3283///
3284/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3285///
3286/// \param __a
3287/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3288/// conversion.
3289/// \returns A 64-bit signed integer containing the converted value.
3290static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3291 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3292}
3293
3294/// Converts the first (lower) element of a vector of [2 x double] into a
3295/// 64-bit signed truncated (rounded toward zero) integer value.
3296///
3297/// If a converted value does not fit in a 64-bit integer, raises a
3298/// floating-point invalid exception. If the exception is masked, returns
3299/// the most negative integer.
3300///
3301/// \headerfile <x86intrin.h>
3302///
3303/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3304/// instruction.
3305///
3306/// \param __a
3307/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3308/// conversion.
3309/// \returns A 64-bit signed integer containing the converted value.
3310static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3311 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3312}
3313#endif
3314
3315/// Converts a vector of [4 x i32] into a vector of [4 x float].
3316///
3317/// \headerfile <x86intrin.h>
3318///
3319/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3320///
3321/// \param __a
3322/// A 128-bit integer vector.
3323/// \returns A 128-bit vector of [4 x float] containing the converted values.
3324static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
3326 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3327}
3328
3329/// Converts a vector of [4 x float] into a vector of [4 x i32].
3330///
3331/// If a converted value does not fit in a 32-bit integer, raises a
3332/// floating-point invalid exception. If the exception is masked, returns
3333/// the most negative integer.
3334///
3335/// \headerfile <x86intrin.h>
3336///
3337/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3338///
3339/// \param __a
3340/// A 128-bit vector of [4 x float].
3341/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3342/// values.
3343static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3344 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3345}
3346
3347/// Converts a vector of [4 x float] into four signed truncated (rounded toward
3348/// zero) 32-bit integers, returned in a vector of [4 x i32].
3349///
3350/// If a converted value does not fit in a 32-bit integer, raises a
3351/// floating-point invalid exception. If the exception is masked, returns
3352/// the most negative integer.
3353///
3354/// \headerfile <x86intrin.h>
3355///
3356/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3357/// instruction.
3358///
3359/// \param __a
3360/// A 128-bit vector of [4 x float].
3361/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3362static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3363 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3364}
3365
3366/// Returns a vector of [4 x i32] where the lowest element is the input
3367/// operand and the remaining elements are zero.
3368///
3369/// \headerfile <x86intrin.h>
3370///
3371/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3372///
3373/// \param __a
3374/// A 32-bit signed integer operand.
3375/// \returns A 128-bit vector of [4 x i32].
3376static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3378 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3379}
3380
3381/// Returns a vector of [2 x i64] where the lower element is the input
3382/// operand and the upper element is zero.
3383///
3384/// \headerfile <x86intrin.h>
3385///
3386/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3387/// in 64-bit mode.
3388///
3389/// \param __a
3390/// A 64-bit signed integer operand containing the value to be converted.
3391/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3392static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3394 return __extension__(__m128i)(__v2di){__a, 0};
3395}
3396
3397/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3398/// 32-bit signed integer value.
3399///
3400/// \headerfile <x86intrin.h>
3401///
3402/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3403///
3404/// \param __a
3405/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3406/// destination.
3407/// \returns A 32-bit signed integer containing the moved value.
3408static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
3410 __v4si __b = (__v4si)__a;
3411 return __b[0];
3412}
3413
3414/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3415/// 64-bit signed integer value.
3416///
3417/// \headerfile <x86intrin.h>
3418///
3419/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3420///
3421/// \param __a
3422/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3423/// destination.
3424/// \returns A 64-bit signed integer containing the moved value.
3425static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
3427 return __a[0];
3428}
3429
3430/// Moves packed integer values from an aligned 128-bit memory location
3431/// to elements in a 128-bit integer vector.
3432///
3433/// \headerfile <x86intrin.h>
3434///
3435/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3436///
3437/// \param __p
3438/// An aligned pointer to a memory location containing integer values.
3439/// \returns A 128-bit integer vector containing the moved values.
3440static __inline__ __m128i __DEFAULT_FN_ATTRS
3441_mm_load_si128(__m128i const *__p) {
3442 return *__p;
3443}
3444
3445/// Moves packed integer values from an unaligned 128-bit memory location
3446/// to elements in a 128-bit integer vector.
3447///
3448/// \headerfile <x86intrin.h>
3449///
3450/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3451///
3452/// \param __p
3453/// A pointer to a memory location containing integer values.
3454/// \returns A 128-bit integer vector containing the moved values.
3455static __inline__ __m128i __DEFAULT_FN_ATTRS
3456_mm_loadu_si128(__m128i_u const *__p) {
3457 struct __loadu_si128 {
3458 __m128i_u __v;
3459 } __attribute__((__packed__, __may_alias__));
3460 return ((const struct __loadu_si128 *)__p)->__v;
3461}
3462
3463/// Returns a vector of [2 x i64] where the lower element is taken from
3464/// the lower element of the operand, and the upper element is zero.
3465///
3466/// \headerfile <x86intrin.h>
3467///
3468/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3469///
3470/// \param __p
3471/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3472/// the destination.
3473/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3474/// moved value. The higher order bits are cleared.
3475static __inline__ __m128i __DEFAULT_FN_ATTRS
3476_mm_loadl_epi64(__m128i_u const *__p) {
3477 struct __mm_loadl_epi64_struct {
3478 long long __u;
3479 } __attribute__((__packed__, __may_alias__));
3480 return __extension__(__m128i){
3481 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3482}
3483
3484/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3485/// This could be used as an argument to another intrinsic function where the
3486/// argument is required but the value is not actually used.
3487///
3488/// \headerfile <x86intrin.h>
3489///
3490/// This intrinsic has no corresponding instruction.
3491///
3492/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3493static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3494 return (__m128i)__builtin_ia32_undef128();
3495}
3496
3497/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3498/// the specified 64-bit integer values.
3499///
3500/// \headerfile <x86intrin.h>
3501///
3502/// This intrinsic is a utility function and does not correspond to a specific
3503/// instruction.
3504///
3505/// \param __q1
3506/// A 64-bit integer value used to initialize the upper 64 bits of the
3507/// destination vector of [2 x i64].
3508/// \param __q0
3509/// A 64-bit integer value used to initialize the lower 64 bits of the
3510/// destination vector of [2 x i64].
3511/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3512/// provided in the operands.
3513static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3514_mm_set_epi64x(long long __q1, long long __q0) {
3515 return __extension__(__m128i)(__v2di){__q0, __q1};
3516}
3517
3518/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3519/// the specified 64-bit integer values.
3520///
3521/// \headerfile <x86intrin.h>
3522///
3523/// This intrinsic is a utility function and does not correspond to a specific
3524/// instruction.
3525///
3526/// \param __q1
3527/// A 64-bit integer value used to initialize the upper 64 bits of the
3528/// destination vector of [2 x i64].
3529/// \param __q0
3530/// A 64-bit integer value used to initialize the lower 64 bits of the
3531/// destination vector of [2 x i64].
3532/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3533/// provided in the operands.
3534static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3535_mm_set_epi64(__m64 __q1, __m64 __q0) {
3536 return _mm_set_epi64x((long long)__q1[0], (long long)__q0[0]);
3537}
3538
3539/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3540/// the specified 32-bit integer values.
3541///
3542/// \headerfile <x86intrin.h>
3543///
3544/// This intrinsic is a utility function and does not correspond to a specific
3545/// instruction.
3546///
3547/// \param __i3
3548/// A 32-bit integer value used to initialize bits [127:96] of the
3549/// destination vector.
3550/// \param __i2
3551/// A 32-bit integer value used to initialize bits [95:64] of the destination
3552/// vector.
3553/// \param __i1
3554/// A 32-bit integer value used to initialize bits [63:32] of the destination
3555/// vector.
3556/// \param __i0
3557/// A 32-bit integer value used to initialize bits [31:0] of the destination
3558/// vector.
3559/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3560/// provided in the operands.
3561static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3,
3562 int __i2,
3563 int __i1,
3564 int __i0) {
3565 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3566}
3567
3568/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3569/// the specified 16-bit integer values.
3570///
3571/// \headerfile <x86intrin.h>
3572///
3573/// This intrinsic is a utility function and does not correspond to a specific
3574/// instruction.
3575///
3576/// \param __w7
3577/// A 16-bit integer value used to initialize bits [127:112] of the
3578/// destination vector.
3579/// \param __w6
3580/// A 16-bit integer value used to initialize bits [111:96] of the
3581/// destination vector.
3582/// \param __w5
3583/// A 16-bit integer value used to initialize bits [95:80] of the destination
3584/// vector.
3585/// \param __w4
3586/// A 16-bit integer value used to initialize bits [79:64] of the destination
3587/// vector.
3588/// \param __w3
3589/// A 16-bit integer value used to initialize bits [63:48] of the destination
3590/// vector.
3591/// \param __w2
3592/// A 16-bit integer value used to initialize bits [47:32] of the destination
3593/// vector.
3594/// \param __w1
3595/// A 16-bit integer value used to initialize bits [31:16] of the destination
3596/// vector.
3597/// \param __w0
3598/// A 16-bit integer value used to initialize bits [15:0] of the destination
3599/// vector.
3600/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3601/// provided in the operands.
3602static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3603_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3604 short __w2, short __w1, short __w0) {
3605 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3606 __w4, __w5, __w6, __w7};
3607}
3608
3609/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3610/// the specified 8-bit integer values.
3611///
3612/// \headerfile <x86intrin.h>
3613///
3614/// This intrinsic is a utility function and does not correspond to a specific
3615/// instruction.
3616///
3617/// \param __b15
3618/// Initializes bits [127:120] of the destination vector.
3619/// \param __b14
3620/// Initializes bits [119:112] of the destination vector.
3621/// \param __b13
3622/// Initializes bits [111:104] of the destination vector.
3623/// \param __b12
3624/// Initializes bits [103:96] of the destination vector.
3625/// \param __b11
3626/// Initializes bits [95:88] of the destination vector.
3627/// \param __b10
3628/// Initializes bits [87:80] of the destination vector.
3629/// \param __b9
3630/// Initializes bits [79:72] of the destination vector.
3631/// \param __b8
3632/// Initializes bits [71:64] of the destination vector.
3633/// \param __b7
3634/// Initializes bits [63:56] of the destination vector.
3635/// \param __b6
3636/// Initializes bits [55:48] of the destination vector.
3637/// \param __b5
3638/// Initializes bits [47:40] of the destination vector.
3639/// \param __b4
3640/// Initializes bits [39:32] of the destination vector.
3641/// \param __b3
3642/// Initializes bits [31:24] of the destination vector.
3643/// \param __b2
3644/// Initializes bits [23:16] of the destination vector.
3645/// \param __b1
3646/// Initializes bits [15:8] of the destination vector.
3647/// \param __b0
3648/// Initializes bits [7:0] of the destination vector.
3649/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3650/// provided in the operands.
3651static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3652_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3653 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3654 char __b4, char __b3, char __b2, char __b1, char __b0) {
3655 return __extension__(__m128i)(__v16qi){
3656 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3657 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3658}
3659
3660/// Initializes both values in a 128-bit integer vector with the
3661/// specified 64-bit integer value.
3662///
3663/// \headerfile <x86intrin.h>
3664///
3665/// This intrinsic is a utility function and does not correspond to a specific
3666/// instruction.
3667///
3668/// \param __q
3669/// Integer value used to initialize the elements of the destination integer
3670/// vector.
3671/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3672/// elements containing the value provided in the operand.
3673static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3674_mm_set1_epi64x(long long __q) {
3675 return _mm_set_epi64x(__q, __q);
3676}
3677
3678/// Initializes both values in a 128-bit vector of [2 x i64] with the
3679/// specified 64-bit value.
3680///
3681/// \headerfile <x86intrin.h>
3682///
3683/// This intrinsic is a utility function and does not correspond to a specific
3684/// instruction.
3685///
3686/// \param __q
3687/// A 64-bit value used to initialize the elements of the destination integer
3688/// vector.
3689/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3690/// containing the value provided in the operand.
3691static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3692_mm_set1_epi64(__m64 __q) {
3693 return _mm_set_epi64(__q, __q);
3694}
3695
3696/// Initializes all values in a 128-bit vector of [4 x i32] with the
3697/// specified 32-bit value.
3698///
3699/// \headerfile <x86intrin.h>
3700///
3701/// This intrinsic is a utility function and does not correspond to a specific
3702/// instruction.
3703///
3704/// \param __i
3705/// A 32-bit value used to initialize the elements of the destination integer
3706/// vector.
3707/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3708/// containing the value provided in the operand.
3709static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i) {
3710 return _mm_set_epi32(__i, __i, __i, __i);
3711}
3712
3713/// Initializes all values in a 128-bit vector of [8 x i16] with the
3714/// specified 16-bit value.
3715///
3716/// \headerfile <x86intrin.h>
3717///
3718/// This intrinsic is a utility function and does not correspond to a specific
3719/// instruction.
3720///
3721/// \param __w
3722/// A 16-bit value used to initialize the elements of the destination integer
3723/// vector.
3724/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3725/// containing the value provided in the operand.
3726static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3727_mm_set1_epi16(short __w) {
3728 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3729}
3730
3731/// Initializes all values in a 128-bit vector of [16 x i8] with the
3732/// specified 8-bit value.
3733///
3734/// \headerfile <x86intrin.h>
3735///
3736/// This intrinsic is a utility function and does not correspond to a specific
3737/// instruction.
3738///
3739/// \param __b
3740/// An 8-bit value used to initialize the elements of the destination integer
3741/// vector.
3742/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3743/// containing the value provided in the operand.
3744static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b) {
3745 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3746 __b, __b, __b, __b, __b);
3747}
3748
3749/// Constructs a 128-bit integer vector, initialized in reverse order
3750/// with the specified 64-bit integral values.
3751///
3752/// \headerfile <x86intrin.h>
3753///
3754/// This intrinsic does not correspond to a specific instruction.
3755///
3756/// \param __q0
3757/// A 64-bit integral value used to initialize the lower 64 bits of the
3758/// result.
3759/// \param __q1
3760/// A 64-bit integral value used to initialize the upper 64 bits of the
3761/// result.
3762/// \returns An initialized 128-bit integer vector.
3763static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3764_mm_setr_epi64(__m64 __q0, __m64 __q1) {
3765 return _mm_set_epi64(__q1, __q0);
3766}
3767
3768/// Constructs a 128-bit integer vector, initialized in reverse order
3769/// with the specified 32-bit integral values.
3770///
3771/// \headerfile <x86intrin.h>
3772///
3773/// This intrinsic is a utility function and does not correspond to a specific
3774/// instruction.
3775///
3776/// \param __i0
3777/// A 32-bit integral value used to initialize bits [31:0] of the result.
3778/// \param __i1
3779/// A 32-bit integral value used to initialize bits [63:32] of the result.
3780/// \param __i2
3781/// A 32-bit integral value used to initialize bits [95:64] of the result.
3782/// \param __i3
3783/// A 32-bit integral value used to initialize bits [127:96] of the result.
3784/// \returns An initialized 128-bit integer vector.
3785static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3786_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) {
3787 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3788}
3789
3790/// Constructs a 128-bit integer vector, initialized in reverse order
3791/// with the specified 16-bit integral values.
3792///
3793/// \headerfile <x86intrin.h>
3794///
3795/// This intrinsic is a utility function and does not correspond to a specific
3796/// instruction.
3797///
3798/// \param __w0
3799/// A 16-bit integral value used to initialize bits [15:0] of the result.
3800/// \param __w1
3801/// A 16-bit integral value used to initialize bits [31:16] of the result.
3802/// \param __w2
3803/// A 16-bit integral value used to initialize bits [47:32] of the result.
3804/// \param __w3
3805/// A 16-bit integral value used to initialize bits [63:48] of the result.
3806/// \param __w4
3807/// A 16-bit integral value used to initialize bits [79:64] of the result.
3808/// \param __w5
3809/// A 16-bit integral value used to initialize bits [95:80] of the result.
3810/// \param __w6
3811/// A 16-bit integral value used to initialize bits [111:96] of the result.
3812/// \param __w7
3813/// A 16-bit integral value used to initialize bits [127:112] of the result.
3814/// \returns An initialized 128-bit integer vector.
3815static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3816_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3817 short __w5, short __w6, short __w7) {
3818 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3819}
3820
3821/// Constructs a 128-bit integer vector, initialized in reverse order
3822/// with the specified 8-bit integral values.
3823///
3824/// \headerfile <x86intrin.h>
3825///
3826/// This intrinsic is a utility function and does not correspond to a specific
3827/// instruction.
3828///
3829/// \param __b0
3830/// An 8-bit integral value used to initialize bits [7:0] of the result.
3831/// \param __b1
3832/// An 8-bit integral value used to initialize bits [15:8] of the result.
3833/// \param __b2
3834/// An 8-bit integral value used to initialize bits [23:16] of the result.
3835/// \param __b3
3836/// An 8-bit integral value used to initialize bits [31:24] of the result.
3837/// \param __b4
3838/// An 8-bit integral value used to initialize bits [39:32] of the result.
3839/// \param __b5
3840/// An 8-bit integral value used to initialize bits [47:40] of the result.
3841/// \param __b6
3842/// An 8-bit integral value used to initialize bits [55:48] of the result.
3843/// \param __b7
3844/// An 8-bit integral value used to initialize bits [63:56] of the result.
3845/// \param __b8
3846/// An 8-bit integral value used to initialize bits [71:64] of the result.
3847/// \param __b9
3848/// An 8-bit integral value used to initialize bits [79:72] of the result.
3849/// \param __b10
3850/// An 8-bit integral value used to initialize bits [87:80] of the result.
3851/// \param __b11
3852/// An 8-bit integral value used to initialize bits [95:88] of the result.
3853/// \param __b12
3854/// An 8-bit integral value used to initialize bits [103:96] of the result.
3855/// \param __b13
3856/// An 8-bit integral value used to initialize bits [111:104] of the result.
3857/// \param __b14
3858/// An 8-bit integral value used to initialize bits [119:112] of the result.
3859/// \param __b15
3860/// An 8-bit integral value used to initialize bits [127:120] of the result.
3861/// \returns An initialized 128-bit integer vector.
3862static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3863_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3864 char __b6, char __b7, char __b8, char __b9, char __b10,
3865 char __b11, char __b12, char __b13, char __b14, char __b15) {
 /* Delegates to _mm_set_epi8 with the sixteen arguments in reverse order
  * (__b15 first, __b0 last). */
3866 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3867 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3868}
3869
3870/// Creates a 128-bit integer vector initialized to zero.
3871///
3872/// \headerfile <x86intrin.h>
3873///
3874/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3875///
3876/// \returns An initialized 128-bit integer vector with all elements set to
3877/// zero.
3878static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void) {
 /* A __v2di literal with both elements set to 0LL, cast to __m128i. */
3879 return __extension__(__m128i)(__v2di){0LL, 0LL};
3880}
3881
3882/// Stores a 128-bit integer vector to a memory location aligned on a
3883/// 128-bit boundary.
3884///
3885/// \headerfile <x86intrin.h>
3886///
3887/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3888///
3889/// \param __p
3890/// A pointer to an aligned memory location that will receive the integer
3891/// values.
3892/// \param __b
3893/// A 128-bit integer vector containing the values to be moved.
3894static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3895 __m128i __b) {
 /* Ordinary assignment through the __m128i pointer; the caller must supply
  * an address aligned on a 128-bit boundary, as the header comment states. */
3896 *__p = __b;
3897}
3898
3899/// Stores a 128-bit integer vector to an unaligned memory location.
3900///
3901/// \headerfile <x86intrin.h>
3902///
3903/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3904///
3905/// \param __p
3906/// A pointer to a memory location that will receive the integer values.
3907/// \param __b
3908/// A 128-bit integer vector containing the values to be moved.
3909static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3910 __m128i __b) {
 /* Local wrapper type: packed and may_alias, holding one __m128i_u (the
  * vector typedef declared with 1-byte alignment). The store goes through
  * this wrapper instead of dereferencing __p directly. */
3911 struct __storeu_si128 {
3912 __m128i_u __v;
3913 } __attribute__((__packed__, __may_alias__));
3914 ((struct __storeu_si128 *)__p)->__v = __b;
3915}
3916
3917/// Stores a 64-bit integer value from the low element of a 128-bit integer
3918/// vector.
3919///
3920/// \headerfile <x86intrin.h>
3921///
3922/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3923///
3924/// \param __p
3925/// A pointer to a 64-bit memory location. The address of the memory
3926/// location does not have to be aligned.
3927/// \param __b
3928/// A 128-bit integer vector containing the value to be stored.
3929static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3930 __m128i __b) {
 /* Packed, may_alias wrapper around a single long long; it supplies the
  * store type for the untyped pointer __p. */
3931 struct __storeu_si64 {
3932 long long __v;
3933 } __attribute__((__packed__, __may_alias__));
 /* Element 0 of __b, viewed as __v2di, is the value written. */
3934 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3935}
3936
3937/// Stores a 32-bit integer value from the low element of a 128-bit integer
3938/// vector.
3939///
3940/// \headerfile <x86intrin.h>
3941///
3942/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3943///
3944/// \param __p
3945/// A pointer to a 32-bit memory location. The address of the memory
3946/// location does not have to be aligned.
3947/// \param __b
3948/// A 128-bit integer vector containing the value to be stored.
3949static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3950 __m128i __b) {
 /* Packed, may_alias wrapper around a single int; it supplies the store
  * type for the untyped pointer __p. */
3951 struct __storeu_si32 {
3952 int __v;
3953 } __attribute__((__packed__, __may_alias__));
 /* Element 0 of __b, viewed as __v4si, is the value written. */
3954 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3955}
3956
3957/// Stores a 16-bit integer value from the low element of a 128-bit integer
3958/// vector.
3959///
3960/// \headerfile <x86intrin.h>
3961///
3962/// This intrinsic does not correspond to a specific instruction.
3963///
3964/// \param __p
3965/// A pointer to a 16-bit memory location. The address of the memory
3966/// location does not have to be aligned.
3967/// \param __b
3968/// A 128-bit integer vector containing the value to be stored.
3969static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3970 __m128i __b) {
 /* Packed, may_alias wrapper around a single short; it supplies the store
  * type for the untyped pointer __p. */
3971 struct __storeu_si16 {
3972 short __v;
3973 } __attribute__((__packed__, __may_alias__));
 /* Element 0 of __b, viewed as __v8hi, is the value written. */
3974 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3975}
3976
3977/// Moves bytes selected by the mask from the first operand to the
3978/// specified unaligned memory location. When a mask bit is 1, the
3979/// corresponding byte is written, otherwise it is not written.
3980///
3981/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3982/// used again soon). Exception and trap behavior for elements not selected
3983/// for storage to memory are implementation dependent.
3984///
3985/// \headerfile <x86intrin.h>
3986///
3987/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3988/// instruction.
3989///
3990/// \param __d
3991/// A 128-bit integer vector containing the values to be moved.
3992/// \param __n
3993/// A 128-bit integer vector containing the mask. The most significant bit of
3994/// each byte represents the mask bits.
3995/// \param __p
3996/// A pointer to an unaligned 128-bit memory location where the specified
3997/// values are moved.
3998static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3999 __m128i __n,
4000 char *__p) {
 /* Data and mask are both reinterpreted as __v16qi; the masked store itself
  * is performed entirely by __builtin_ia32_maskmovdqu. */
4001 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4002}
4003
4004/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4005/// a memory location.
4006///
4007/// \headerfile <x86intrin.h>
4008///
4009/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4010///
4011/// \param __p
4012/// A pointer to a 64-bit memory location that will receive the lower 64 bits
4013/// of the integer vector parameter.
4014/// \param __a
4015/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4016/// value to be stored.
4017static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
4018 __m128i __a) {
 /* Only a long long is written: __p is re-cast to a packed, may_alias struct
  * holding one long long rather than being dereferenced as __m128i_u. */
4019 struct __mm_storel_epi64_struct {
4020 long long __u;
4021 } __attribute__((__packed__, __may_alias__));
 /* __a is subscripted directly; element 0 is the value stored. */
4022 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4023}
4024
4025/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4026/// aligned memory location.
4027///
4028/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4029/// used again soon).
4030///
4031/// \headerfile <x86intrin.h>
4032///
4033/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4034///
4035/// \param __p
4036/// A pointer to the 128-bit aligned memory location used to store the value.
4037/// \param __a
4038/// A vector of [2 x double] containing the 64-bit values to be stored.
4039static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4040 __m128d __a) {
 /* __p is untyped, so it is cast to __v2df * to give
  * __builtin_nontemporal_store a destination type matching the value. */
4041 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4042}
4043
4044/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4045///
4046/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4047/// used again soon).
4048///
4049/// \headerfile <x86intrin.h>
4050///
4051/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4052///
4053/// \param __p
4054/// A pointer to the 128-bit aligned memory location used to store the value.
4055/// \param __a
4056/// A 128-bit integer vector containing the values to be stored.
4057static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4058 __m128i __a) {
 /* __p is untyped, so it is cast to __v2di * to give
  * __builtin_nontemporal_store a destination type matching the value. */
4059 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4060}
4061
4062/// Stores a 32-bit integer value in the specified memory location.
4063///
4064/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4065/// used again soon).
4066///
4067/// \headerfile <x86intrin.h>
4068///
4069/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4070///
4071/// \param __p
4072/// A pointer to the 32-bit memory location used to store the value.
4073/// \param __a
4074/// A 32-bit integer containing the value to be stored.
4075static __inline__ void
4076 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4077 _mm_stream_si32(void *__p, int __a) {
 /* The attribute list is written out here instead of using
  * __DEFAULT_FN_ATTRS; compared with that macro it leaves out
  * __min_vector_width__(128). __p is cast to int * for the builtin. */
4078 __builtin_ia32_movnti((int *)__p, __a);
4079}
4080
4081#ifdef __x86_64__
4082/// Stores a 64-bit integer value in the specified memory location.
4083///
4084/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4085/// used again soon).
4086///
4087/// \headerfile <x86intrin.h>
4088///
4089/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4090///
4091/// \param __p
4092/// A pointer to the 64-bit memory location used to store the value.
4093/// \param __a
4094/// A 64-bit integer containing the value to be stored.
4095static __inline__ void
4096 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4097 _mm_stream_si64(void *__p, long long __a) {
 /* The attribute list is written out here instead of using
  * __DEFAULT_FN_ATTRS; compared with that macro it leaves out
  * __min_vector_width__(128). __p is cast to long long * for the builtin. */
4098 __builtin_ia32_movnti64((long long *)__p, __a);
4099}
4100#endif
4101
4102#if defined(__cplusplus)
4103extern "C" {
4104#endif
4105
4106/// The cache line containing \a __p is flushed and invalidated from all
4107/// caches in the coherency domain.
4108///
4109/// \headerfile <x86intrin.h>
4110///
4111/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4112///
4113/// \param __p
4114/// A pointer to the memory location used to identify the cache line to be
4115/// flushed.
4116void _mm_clflush(void const *__p);
4117
4118/// Forces strong memory ordering (serialization) between load
4119/// instructions preceding this instruction and load instructions following
4120/// this instruction, ensuring the system completes all previous loads before
4121/// executing subsequent loads.
4122///
4123/// \headerfile <x86intrin.h>
4124///
4125/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4126///
4127void _mm_lfence(void);
4128
4129/// Forces strong memory ordering (serialization) between load and store
4130/// instructions preceding this instruction and load and store instructions
4131/// following this instruction, ensuring that the system completes all
4132/// previous memory accesses before executing subsequent memory accesses.
4133///
4134/// \headerfile <x86intrin.h>
4135///
4136/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4137///
4138void _mm_mfence(void);
4139
4140#if defined(__cplusplus)
4141} // extern "C"
4142#endif
4143
4144/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4145/// vector operands into 8-bit signed integers, and packs the results into
4146/// the destination.
4147///
4148/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
4149/// less than -128 (0xFF80) are saturated to 0x80.
4150///
4151/// \headerfile <x86intrin.h>
4152///
4153/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4154///
4155/// \param __a
4156/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4157/// written to the lower 64 bits of the result.
4158/// \param __b
4159/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4160/// written to the higher 64 bits of the result.
4161/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4162static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4163_mm_packs_epi16(__m128i __a, __m128i __b) {
 /* Both operands are reinterpreted as __v8hi for
  * __builtin_ia32_packsswb128; its result is cast back to __m128i. */
4164 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4165}
4166
4167/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4168/// vector operands into 16-bit signed integers, and packs the results into
4169/// the destination.
4170///
4171/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4172/// values less than -32768 (0xFFFF8000) are saturated to 0x8000.
4173///
4174/// \headerfile <x86intrin.h>
4175///
4176/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4177///
4178/// \param __a
4179/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4180/// are written to the lower 64 bits of the result.
4181/// \param __b
4182/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4183/// are written to the higher 64 bits of the result.
4184/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4185static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4186_mm_packs_epi32(__m128i __a, __m128i __b) {
 /* Both operands are reinterpreted as __v4si for
  * __builtin_ia32_packssdw128; its result is cast back to __m128i. */
4187 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4188}
4189
4190/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4191/// vector operands into 8-bit unsigned integers, and packs the results into
4192/// the destination.
4193///
4194/// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4195/// are saturated to 0x00.
4196///
4197/// \headerfile <x86intrin.h>
4198///
4199/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4200///
4201/// \param __a
4202/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4203/// written to the lower 64 bits of the result.
4204/// \param __b
4205/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4206/// written to the higher 64 bits of the result.
4207/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4208static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4209_mm_packus_epi16(__m128i __a, __m128i __b) {
 /* Both operands are reinterpreted as __v8hi for
  * __builtin_ia32_packuswb128; its result is cast back to __m128i. */
4210 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4211}
4212
4213/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4214/// the immediate-value parameter as a selector.
4215///
4216/// \headerfile <x86intrin.h>
4217///
4218/// \code
4219/// int _mm_extract_epi16(__m128i a, const int imm);
4220/// \endcode
4221///
4222/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4223///
4224/// \param a
4225/// A 128-bit integer vector.
4226/// \param imm
4227/// An immediate value. Bits [2:0] selects values from \a a to be assigned
4228/// to bits[15:0] of the result. \n
4229/// 000: assign values from bits [15:0] of \a a. \n
4230/// 001: assign values from bits [31:16] of \a a. \n
4231/// 010: assign values from bits [47:32] of \a a. \n
4232/// 011: assign values from bits [63:48] of \a a. \n
4233/// 100: assign values from bits [79:64] of \a a. \n
4234/// 101: assign values from bits [95:80] of \a a. \n
4235/// 110: assign values from bits [111:96] of \a a. \n
4236/// 111: assign values from bits [127:112] of \a a.
4237/// \returns An integer, whose lower 16 bits are selected from the 128-bit
4238/// integer vector parameter and the remaining bits are assigned zeros.
/* The (unsigned short) cast makes the following widening to int a
 * zero-extension, so result bits above bit 15 are 0. */
4239#define _mm_extract_epi16(a, imm) \
4240 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4241 (int)(imm)))
4242
4243/// Constructs a 128-bit integer vector by first making a copy of the
4244/// 128-bit integer vector parameter, and then inserting the lower 16 bits
4245/// of an integer parameter into an offset specified by the immediate-value
4246/// parameter.
4247///
4248/// \headerfile <x86intrin.h>
4249///
4250/// \code
4251/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4252/// \endcode
4253///
4254/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4255///
4256/// \param a
4257/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4258/// result and then one of the eight elements in the result is replaced by
4259/// the lower 16 bits of \a b.
4260/// \param b
4261/// An integer. The lower 16 bits of this parameter are written to the
4262/// result beginning at an offset specified by \a imm.
4263/// \param imm
4264/// An immediate value specifying the index (0-7) of the 16-bit element in
4265/// the result that is replaced by the lower 16 bits of \a b.
4266/// \returns A 128-bit integer vector containing the constructed values.
/* a is reinterpreted as __v8hi; b and imm are each converted to int before
 * the call to __builtin_ia32_vec_set_v8hi. */
4267#define _mm_insert_epi16(a, b, imm) \
4268 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4269 (int)(imm)))
4270
4271/// Copies the values of the most significant bits from each 8-bit
4272/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4273/// value, zero-extends the value, and writes it to the destination.
4274///
4275/// \headerfile <x86intrin.h>
4276///
4277/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4278///
4279/// \param __a
4280/// A 128-bit integer vector containing the values with bits to be extracted.
4281/// \returns The most significant bits from each 8-bit element in \a __a,
4282/// written to bits [15:0]. The other bits are assigned zeros.
4283static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
4285 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4286}
4287
4288/// Constructs a 128-bit integer vector by shuffling four 32-bit
4289/// elements of a 128-bit integer vector parameter, using the immediate-value
4290/// parameter as a specifier.
4291///
4292/// \headerfile <x86intrin.h>
4293///
4294/// \code
4295/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4296/// \endcode
4297///
4298/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4299///
4300/// \param a
4301/// A 128-bit integer vector containing the values to be copied.
4302/// \param imm
4303/// An immediate value containing an 8-bit value specifying which elements to
4304/// copy from a. The destinations within the 128-bit destination are assigned
4305/// values as follows: \n
4306/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4307/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4308/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4309/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4310/// Bit value assignments: \n
4311/// 00: assign values from bits [31:0] of \a a. \n
4312/// 01: assign values from bits [63:32] of \a a. \n
4313/// 10: assign values from bits [95:64] of \a a. \n
4314/// 11: assign values from bits [127:96] of \a a. \n
4315/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4316/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4317/// <c>[b6, b4, b2, b0]</c>.
4318/// \returns A 128-bit integer vector containing the shuffled values.
/* a is reinterpreted as __v4si; imm reaches __builtin_ia32_pshufd unchanged
 * apart from the (int) cast. */
4319#define _mm_shuffle_epi32(a, imm) \
4320 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4321
4322/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4323/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4324/// value parameter as a specifier.
4325///
4326/// \headerfile <x86intrin.h>
4327///
4328/// \code
4329/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4330/// \endcode
4331///
4332/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4333///
4334/// \param a
4335/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4336/// [127:64] of the result.
4337/// \param imm
4338/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4339/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4340/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4341/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4342/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4343/// Bit value assignments: \n
4344/// 00: assign values from bits [15:0] of \a a. \n
4345/// 01: assign values from bits [31:16] of \a a. \n
4346/// 10: assign values from bits [47:32] of \a a. \n
4347/// 11: assign values from bits [63:48] of \a a. \n
4348/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4349/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4350/// <c>[b6, b4, b2, b0]</c>.
4351/// \returns A 128-bit integer vector containing the shuffled values.
/* a is reinterpreted as __v8hi; imm reaches __builtin_ia32_pshuflw unchanged
 * apart from the (int) cast. */
4352#define _mm_shufflelo_epi16(a, imm) \
4353 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4354
4355/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4356/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4357/// value parameter as a specifier.
4358///
4359/// \headerfile <x86intrin.h>
4360///
4361/// \code
4362/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4363/// \endcode
4364///
4365/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4366///
4367/// \param a
4368/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4369/// [63:0] of the result.
4370/// \param imm
4371/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4372/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4373/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4374/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4375/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4376/// Bit value assignments: \n
4377/// 00: assign values from bits [79:64] of \a a. \n
4378/// 01: assign values from bits [95:80] of \a a. \n
4379/// 10: assign values from bits [111:96] of \a a. \n
4380/// 11: assign values from bits [127:112] of \a a. \n
4381/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4382/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4383/// <c>[b6, b4, b2, b0]</c>.
4384/// \returns A 128-bit integer vector containing the shuffled values.
/* a is reinterpreted as __v8hi; imm reaches __builtin_ia32_pshufhw unchanged
 * apart from the (int) cast. */
4385#define _mm_shufflehi_epi16(a, imm) \
4386 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4387
4388/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4389/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4390///
4391/// \headerfile <x86intrin.h>
4392///
4393/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4394/// instruction.
4395///
4396/// \param __a
4397/// A 128-bit vector of [16 x i8]. \n
4398/// Bits [71:64] are written to bits [7:0] of the result. \n
4399/// Bits [79:72] are written to bits [23:16] of the result. \n
4400/// Bits [87:80] are written to bits [39:32] of the result. \n
4401/// Bits [95:88] are written to bits [55:48] of the result. \n
4402/// Bits [103:96] are written to bits [71:64] of the result. \n
4403/// Bits [111:104] are written to bits [87:80] of the result. \n
4404/// Bits [119:112] are written to bits [103:96] of the result. \n
4405/// Bits [127:120] are written to bits [119:112] of the result.
4406/// \param __b
4407/// A 128-bit vector of [16 x i8]. \n
4408/// Bits [71:64] are written to bits [15:8] of the result. \n
4409/// Bits [79:72] are written to bits [31:24] of the result. \n
4410/// Bits [87:80] are written to bits [47:40] of the result. \n
4411/// Bits [95:88] are written to bits [63:56] of the result. \n
4412/// Bits [103:96] are written to bits [79:72] of the result. \n
4413/// Bits [111:104] are written to bits [95:88] of the result. \n
4414/// Bits [119:112] are written to bits [111:104] of the result. \n
4415/// Bits [127:120] are written to bits [127:120] of the result.
4416/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4417static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4418_mm_unpackhi_epi8(__m128i __a, __m128i __b) {
 /* The index list is the pairs (k, 16 + k) for k = 8..15: byte k of __a
  * followed by byte k of __b, matching the interleave documented above. */
4419 return (__m128i)__builtin_shufflevector(
4420 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4421 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4422}
4423
4424/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4425/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4426///
4427/// \headerfile <x86intrin.h>
4428///
4429/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4430/// instruction.
4431///
4432/// \param __a
4433/// A 128-bit vector of [8 x i16]. \n
4434/// Bits [79:64] are written to bits [15:0] of the result. \n
4435/// Bits [95:80] are written to bits [47:32] of the result. \n
4436/// Bits [111:96] are written to bits [79:64] of the result. \n
4437/// Bits [127:112] are written to bits [111:96] of the result.
4438/// \param __b
4439/// A 128-bit vector of [8 x i16]. \n
4440/// Bits [79:64] are written to bits [31:16] of the result. \n
4441/// Bits [95:80] are written to bits [63:48] of the result. \n
4442/// Bits [111:96] are written to bits [95:80] of the result. \n
4443/// Bits [127:112] are written to bits [127:112] of the result.
4444/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4445static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4446_mm_unpackhi_epi16(__m128i __a, __m128i __b) {
 /* The index list is the pairs (k, 8 + k) for k = 4..7: element k of __a
  * followed by element k of __b, matching the interleave documented above. */
4447 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4448 8 + 5, 6, 8 + 6, 7, 8 + 7);
4449}
4450
4451/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4452/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4453///
4454/// \headerfile <x86intrin.h>
4455///
4456/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4457/// instruction.
4458///
4459/// \param __a
4460/// A 128-bit vector of [4 x i32]. \n
4461/// Bits [95:64] are written to bits [31:0] of the destination. \n
4462/// Bits [127:96] are written to bits [95:64] of the destination.
4463/// \param __b
4464/// A 128-bit vector of [4 x i32]. \n
4465/// Bits [95:64] are written to bits [63:32] of the destination. \n
4466/// Bits [127:96] are written to bits [127:96] of the destination.
4467/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4468static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4469_mm_unpackhi_epi32(__m128i __a, __m128i __b) {
 /* The index list is the pairs (k, 4 + k) for k = 2, 3: element k of __a
  * followed by element k of __b. */
4470 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4471 4 + 3);
4472}
4473
4474/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4475/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4476///
4477/// \headerfile <x86intrin.h>
4478///
4479/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4480/// instruction.
4481///
4482/// \param __a
4483/// A 128-bit vector of [2 x i64]. \n
4484/// Bits [127:64] are written to bits [63:0] of the destination.
4485/// \param __b
4486/// A 128-bit vector of [2 x i64]. \n
4487/// Bits [127:64] are written to bits [127:64] of the destination.
4488/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4489static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4490_mm_unpackhi_epi64(__m128i __a, __m128i __b) {
 /* Indices 1 and 2 + 1: element 1 of __a followed by element 1 of __b. */
4491 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4492}
4493
4494/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4495/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4496///
4497/// \headerfile <x86intrin.h>
4498///
4499/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4500/// instruction.
4501///
4502/// \param __a
4503/// A 128-bit vector of [16 x i8]. \n
4504/// Bits [7:0] are written to bits [7:0] of the result. \n
4505/// Bits [15:8] are written to bits [23:16] of the result. \n
4506/// Bits [23:16] are written to bits [39:32] of the result. \n
4507/// Bits [31:24] are written to bits [55:48] of the result. \n
4508/// Bits [39:32] are written to bits [71:64] of the result. \n
4509/// Bits [47:40] are written to bits [87:80] of the result. \n
4510/// Bits [55:48] are written to bits [103:96] of the result. \n
4511/// Bits [63:56] are written to bits [119:112] of the result.
4512/// \param __b
4513/// A 128-bit vector of [16 x i8]. \n
4514/// Bits [7:0] are written to bits [15:8] of the result. \n
4515/// Bits [15:8] are written to bits [31:24] of the result. \n
4516/// Bits [23:16] are written to bits [47:40] of the result. \n
4517/// Bits [31:24] are written to bits [63:56] of the result. \n
4518/// Bits [39:32] are written to bits [79:72] of the result. \n
4519/// Bits [47:40] are written to bits [95:88] of the result. \n
4520/// Bits [55:48] are written to bits [111:104] of the result. \n
4521/// Bits [63:56] are written to bits [127:120] of the result.
4522/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4523static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4524_mm_unpacklo_epi8(__m128i __a, __m128i __b) {
 /* The index list is the pairs (k, 16 + k) for k = 0..7: byte k of __a
  * followed by byte k of __b, matching the interleave documented above. */
4525 return (__m128i)__builtin_shufflevector(
4526 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4527 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4528}
4529
4530/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4531/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4532/// [8 x i16].
4533///
4534/// \headerfile <x86intrin.h>
4535///
4536/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4537/// instruction.
4538///
4539/// \param __a
4540/// A 128-bit vector of [8 x i16]. \n
4541/// Bits [15:0] are written to bits [15:0] of the result. \n
4542/// Bits [31:16] are written to bits [47:32] of the result. \n
4543/// Bits [47:32] are written to bits [79:64] of the result. \n
4544/// Bits [63:48] are written to bits [111:96] of the result.
4545/// \param __b
4546/// A 128-bit vector of [8 x i16]. \n
4547/// Bits [15:0] are written to bits [31:16] of the result. \n
4548/// Bits [31:16] are written to bits [63:48] of the result. \n
4549/// Bits [47:32] are written to bits [95:80] of the result. \n
4550/// Bits [63:48] are written to bits [127:112] of the result.
4551/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4552static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4553_mm_unpacklo_epi16(__m128i __a, __m128i __b) {
 /* The index list is the pairs (k, 8 + k) for k = 0..3: element k of __a
  * followed by element k of __b, matching the interleave documented above. */
4554 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4555 8 + 1, 2, 8 + 2, 3, 8 + 3);
4556}
4557
4558/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4559/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4560///
4561/// \headerfile <x86intrin.h>
4562///
4563/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4564/// instruction.
4565///
4566/// \param __a
4567/// A 128-bit vector of [4 x i32]. \n
4568/// Bits [31:0] are written to bits [31:0] of the destination. \n
4569/// Bits [63:32] are written to bits [95:64] of the destination.
4570/// \param __b
4571/// A 128-bit vector of [4 x i32]. \n
4572/// Bits [31:0] are written to bits [63:32] of the destination. \n
4573/// Bits [63:32] are written to bits [127:96] of the destination.
4574/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4575static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4576_mm_unpacklo_epi32(__m128i __a, __m128i __b) {
 /* The index list is the pairs (k, 4 + k) for k = 0, 1: element k of __a
  * followed by element k of __b. */
4577 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4578 4 + 1);
4579}
4580
4581/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4582/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4583///
4584/// \headerfile <x86intrin.h>
4585///
4586/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4587/// instruction.
4588///
4589/// \param __a
4590/// A 128-bit vector of [2 x i64]. \n
4591/// Bits [63:0] are written to bits [63:0] of the destination. \n
4592/// \param __b
4593/// A 128-bit vector of [2 x i64]. \n
4594/// Bits [63:0] are written to bits [127:64] of the destination. \n
4595/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4596static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4597_mm_unpacklo_epi64(__m128i __a, __m128i __b) {
4598 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4599}
4600
4601/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4602/// integer.
4603///
4604/// \headerfile <x86intrin.h>
4605///
4606/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4607///
4608/// \param __a
4609/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4610/// destination.
4611/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4612static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
4614 return (__m64)__a[0];
4615}
4616
4617/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4618/// upper bits.
4619///
4620/// \headerfile <x86intrin.h>
4621///
4622/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4623///
4624/// \param __a
4625/// A 64-bit value.
4626/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4627/// the operand. The upper 64 bits are assigned zeros.
4628static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4630 return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1);
4631}
4632
4633/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4634/// integer vector, zeroing the upper bits.
4635///
4636/// \headerfile <x86intrin.h>
4637///
4638/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4639///
4640/// \param __a
4641/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4642/// destination.
4643/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4644/// the operand. The upper 64 bits are assigned zeros.
4645static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4647 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4648}
4649
4650/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4651/// [2 x double] and interleaves them into a 128-bit vector of [2 x
4652/// double].
4653///
4654/// \headerfile <x86intrin.h>
4655///
4656/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4657///
4658/// \param __a
4659/// A 128-bit vector of [2 x double]. \n
4660/// Bits [127:64] are written to bits [63:0] of the destination.
4661/// \param __b
4662/// A 128-bit vector of [2 x double]. \n
4663/// Bits [127:64] are written to bits [127:64] of the destination.
4664/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4665static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4666_mm_unpackhi_pd(__m128d __a, __m128d __b) {
4667 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4668}
4669
4670/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4671/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4672/// double].
4673///
4674/// \headerfile <x86intrin.h>
4675///
4676/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4677///
4678/// \param __a
4679/// A 128-bit vector of [2 x double]. \n
4680/// Bits [63:0] are written to bits [63:0] of the destination.
4681/// \param __b
4682/// A 128-bit vector of [2 x double]. \n
4683/// Bits [63:0] are written to bits [127:64] of the destination.
4684/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4685static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4686_mm_unpacklo_pd(__m128d __a, __m128d __b) {
4687 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4688}
4689
4690/// Extracts the sign bits of the double-precision values in the 128-bit
4691/// vector of [2 x double], zero-extends the value, and writes it to the
4692/// low-order bits of the destination.
4693///
4694/// \headerfile <x86intrin.h>
4695///
4696/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4697///
4698/// \param __a
4699/// A 128-bit vector of [2 x double] containing the values with sign bits to
4700/// be extracted.
4701/// \returns The sign bits from each of the double-precision elements in \a __a,
4702/// written to bits [1:0]. The remaining bits are assigned values of zero.
4703static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
4705 return __builtin_ia32_movmskpd((__v2df)__a);
4706}
4707
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///     parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4738
4739/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4740/// floating-point vector of [4 x float].
4741///
4742/// \headerfile <x86intrin.h>
4743///
4744/// This intrinsic has no corresponding instruction.
4745///
4746/// \param __a
4747/// A 128-bit floating-point vector of [2 x double].
4748/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4749/// bitwise pattern as the parameter.
4750static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4752 return (__m128)__a;
4753}
4754
4755/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4756/// integer vector.
4757///
4758/// \headerfile <x86intrin.h>
4759///
4760/// This intrinsic has no corresponding instruction.
4761///
4762/// \param __a
4763/// A 128-bit floating-point vector of [2 x double].
4764/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4765/// parameter.
4766static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4768 return (__m128i)__a;
4769}
4770
4771/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4772/// floating-point vector of [2 x double].
4773///
4774/// \headerfile <x86intrin.h>
4775///
4776/// This intrinsic has no corresponding instruction.
4777///
4778/// \param __a
4779/// A 128-bit floating-point vector of [4 x float].
4780/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4781/// bitwise pattern as the parameter.
4782static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4784 return (__m128d)__a;
4785}
4786
4787/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4788/// integer vector.
4789///
4790/// \headerfile <x86intrin.h>
4791///
4792/// This intrinsic has no corresponding instruction.
4793///
4794/// \param __a
4795/// A 128-bit floating-point vector of [4 x float].
4796/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4797/// parameter.
4798static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
4800 return (__m128i)__a;
4801}
4802
4803/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4804/// of [4 x float].
4805///
4806/// \headerfile <x86intrin.h>
4807///
4808/// This intrinsic has no corresponding instruction.
4809///
4810/// \param __a
4811/// A 128-bit integer vector.
4812/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4813/// bitwise pattern as the parameter.
4814static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
4816 return (__m128)__a;
4817}
4818
4819/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4820/// of [2 x double].
4821///
4822/// \headerfile <x86intrin.h>
4823///
4824/// This intrinsic has no corresponding instruction.
4825///
4826/// \param __a
4827/// A 128-bit integer vector.
4828/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4829/// bitwise pattern as the parameter.
4830static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
4832 return (__m128d)__a;
4833}
4834
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4870
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4906
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4923
/* Remove the header-private helper macros so they do not leak to includers. */
#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
4928
/* Builds the 2-bit selector [x, y]: x lands in bit 1 and y in bit 0. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values for the denormals-are-zero field of the control/status word. */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Bit mask isolating the denormals-are-zero field. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Read the field from _mm_getcsr(), or rewrite only that field through
 * _mm_setcsr() while leaving every other bit as it was. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4939
4940#endif /* __EMMINTRIN_H */
__device__ _Float16
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition emmintrin.h:1548
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1049
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition emmintrin.h:1963
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition emmintrin.h:4767
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2935
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition emmintrin.h:2657
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1025
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition emmintrin.h:4613
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition emmintrin.h:4704
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition emmintrin.h:218
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition emmintrin.h:3393
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition emmintrin.h:2370
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition emmintrin.h:1853
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition emmintrin.h:120
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition emmintrin.h:4186
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:590
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3050
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition emmintrin.h:2250
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2878
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition emmintrin.h:1656
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition emmintrin.h:4057
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:2996
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition emmintrin.h:2332
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:825
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3068
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition emmintrin.h:199
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1191
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition emmintrin.h:1619
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2859
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition emmintrin.h:3998
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1167
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1215
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition emmintrin.h:1783
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition emmintrin.h:1815
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1282
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition emmintrin.h:1563
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition emmintrin.h:4524
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition emmintrin.h:3125
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3247
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:747
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition emmintrin.h:4017
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition emmintrin.h:80
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition emmintrin.h:4284
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:524
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition emmintrin.h:3969
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition emmintrin.h:420
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition emmintrin.h:303
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition emmintrin.h:1694
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition emmintrin.h:2269
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition emmintrin.h:3377
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition emmintrin.h:4553
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition emmintrin.h:1675
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:772
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition emmintrin.h:3426
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition emmintrin.h:2519
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition emmintrin.h:3727
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition emmintrin.h:2536
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition emmintrin.h:2063
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1143
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition emmintrin.h:2427
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition emmintrin.h:1943
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2841
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition emmintrin.h:403
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:798
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition emmintrin.h:1323
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:978
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition emmintrin.h:3535
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:722
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition emmintrin.h:2294
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:674
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition emmintrin.h:4209
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition emmintrin.h:3514
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition emmintrin.h:4751
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition emmintrin.h:2351
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition emmintrin.h:4783
static __inline__ void int __a
Definition emmintrin.h:4077
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition emmintrin.h:3087
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition emmintrin.h:1833
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition emmintrin.h:2121
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition emmintrin.h:3949
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1497
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition emmintrin.h:2693
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition emmintrin.h:3764
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:482
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2823
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition emmintrin.h:1478
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition emmintrin.h:2571
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition emmintrin.h:1435
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of each of the two values stored in a 128-bit vector of [2 x double].
Definition emmintrin.h:258
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition emmintrin.h:2593
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition emmintrin.h:177
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1345
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition emmintrin.h:3493
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2787
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1239
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition emmintrin.h:2709
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition emmintrin.h:3674
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition emmintrin.h:1637
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition emmintrin.h:3863
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition emmintrin.h:2209
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:611
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition emmintrin.h:1364
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition emmintrin.h:1745
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2769
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition emmintrin.h:1765
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition emmintrin.h:4576
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition emmintrin.h:4646
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition emmintrin.h:2231
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition emmintrin.h:4799
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3227
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:2978
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition emmintrin.h:3692
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1097
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:569
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3014
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition emmintrin.h:366
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1001
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition emmintrin.h:3207
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition emmintrin.h:3106
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition emmintrin.h:1887
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition emmintrin.h:1603
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition emmintrin.h:4597
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1073
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:653
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3187
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition emmintrin.h:3145
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition emmintrin.h:3561
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition emmintrin.h:159
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition emmintrin.h:3603
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:2019
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition emmintrin.h:3816
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition emmintrin.h:2484
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition emmintrin.h:4418
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:952
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:852
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:2038
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:1924
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition emmintrin.h:3456
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition emmintrin.h:2165
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:927
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:503
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition emmintrin.h:2389
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition emmintrin.h:3441
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:697
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition emmintrin.h:4039
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition emmintrin.h:2464
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition emmintrin.h:2105
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:902
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition emmintrin.h:1301
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition emmintrin.h:2502
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:877
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition emmintrin.h:2444
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers,...
Definition emmintrin.h:3362
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition emmintrin.h:4469
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition emmintrin.h:1719
#define __trunc64(x)
Definition emmintrin.h:56
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition emmintrin.h:242
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition emmintrin.h:1410
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition emmintrin.h:282
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition emmintrin.h:98
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition emmintrin.h:3409
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition emmintrin.h:3929
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition emmintrin.h:1903
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition emmintrin.h:4629
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition emmintrin.h:4163
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition emmintrin.h:1579
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition emmintrin.h:4446
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition emmintrin.h:4666
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition emmintrin.h:2674
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition emmintrin.h:349
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2897
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition emmintrin.h:3894
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition emmintrin.h:3786
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition emmintrin.h:4686
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition emmintrin.h:4490
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2805
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition emmintrin.h:440
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition emmintrin.h:3032
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition emmintrin.h:2408
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition emmintrin.h:2313
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition emmintrin.h:328
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition emmintrin.h:3476
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition emmintrin.h:1388
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition emmintrin.h:4815
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition emmintrin.h:2553
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition emmintrin.h:2615
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition emmintrin.h:4831
#define __zext128(x)
Definition emmintrin.h:58
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition emmintrin.h:2143
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition emmintrin.h:2002
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition emmintrin.h:387
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition emmintrin.h:2084
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1516
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition emmintrin.h:3325
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition emmintrin.h:1532
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition emmintrin.h:3167
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition emmintrin.h:1980
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition emmintrin.h:3709
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1263
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition emmintrin.h:3652
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition emmintrin.h:1799
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition emmintrin.h:1121
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:546
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:461
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition emmintrin.h:3909
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition emmintrin.h:2187
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition emmintrin.h:19
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition emmintrin.h:138
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition emmintrin.h:1458
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the ...
Definition emmintrin.h:2636
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2916
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition emmintrin.h:3343
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition emmintrin.h:3744
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition emmintrin.h:2726
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition emmintrin.h:632
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition mmintrin.h:1273