clang 19.0.0git
emmintrin.h
Go to the documentation of this file.
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __EMMINTRIN_H
11#define __EMMINTRIN_H
12
/* Refuse to compile unless the target is 32-bit or 64-bit x86. */
#if !(defined(__i386__) || defined(__x86_64__))
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
16
17#include <xmmintrin.h>
18
/* 128-bit vector types of [2 x double] and [2 x long long]. The plain names
 * are declared with 16-byte alignment; the `_u` variants are declared with
 * 1-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));
25
/* Element-typed 16-byte vector types, grouped by lane width. */

/* Two 64-bit lanes. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));

/* Eight 16-bit lanes. */
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));

/* Sixteen 8-bit lanes. An explicitly signed char variant (__v16qs) is needed
 * as well; it should not appear in the interface though. */
typedef char __v16qi __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
typedef signed char __v16qs __attribute__((__vector_size__(16)));
40
41#ifdef __SSE2__
42/* Both _Float16 and __bf16 require SSE2 being enabled. */
43typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46
47typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49#endif
50
/* Default attribute sets for the functions in this file. Both force inlining
 * and suppress debug info; they differ in the target features requested and
 * in the minimum vector width (128 bits vs. 64 bits for the MMX variant). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("sse2,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("mmx,sse2,no-evex512"), \
                 __min_vector_width__(64)))
58
59/// Adds lower double-precision values in both operands and returns the
60/// sum in the lower 64 bits of the result. The upper 64 bits of the result
61/// are copied from the upper double-precision value of the first operand.
62///
63/// \headerfile <x86intrin.h>
64///
65/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66///
67/// \param __a
68/// A 128-bit vector of [2 x double] containing one of the source operands.
69/// \param __b
70/// A 128-bit vector of [2 x double] containing one of the source operands.
71/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
73/// from the upper 64 bits of the first source operand.
74static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75 __m128d __b) {
76 __a[0] += __b[0];
77 return __a;
78}
79
80/// Adds two 128-bit vectors of [2 x double].
81///
82/// \headerfile <x86intrin.h>
83///
84/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85///
86/// \param __a
87/// A 128-bit vector of [2 x double] containing one of the source operands.
88/// \param __b
89/// A 128-bit vector of [2 x double] containing one of the source operands.
90/// \returns A 128-bit vector of [2 x double] containing the sums of both
91/// operands.
92static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93 __m128d __b) {
94 return (__m128d)((__v2df)__a + (__v2df)__b);
95}
96
97/// Subtracts the lower double-precision value of the second operand
98/// from the lower double-precision value of the first operand and returns
99/// the difference in the lower 64 bits of the result. The upper 64 bits of
100/// the result are copied from the upper double-precision value of the first
101/// operand.
102///
103/// \headerfile <x86intrin.h>
104///
105/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106///
107/// \param __a
108/// A 128-bit vector of [2 x double] containing the minuend.
109/// \param __b
110/// A 128-bit vector of [2 x double] containing the subtrahend.
111/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112/// difference of the lower 64 bits of both operands. The upper 64 bits are
113/// copied from the upper 64 bits of the first source operand.
114static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115 __m128d __b) {
116 __a[0] -= __b[0];
117 return __a;
118}
119
120/// Subtracts two 128-bit vectors of [2 x double].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125///
126/// \param __a
127/// A 128-bit vector of [2 x double] containing the minuend.
128/// \param __b
129/// A 128-bit vector of [2 x double] containing the subtrahend.
130/// \returns A 128-bit vector of [2 x double] containing the differences between
131/// both operands.
132static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133 __m128d __b) {
134 return (__m128d)((__v2df)__a - (__v2df)__b);
135}
136
137/// Multiplies lower double-precision values in both operands and returns
138/// the product in the lower 64 bits of the result. The upper 64 bits of the
139/// result are copied from the upper double-precision value of the first
140/// operand.
141///
142/// \headerfile <x86intrin.h>
143///
144/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145///
146/// \param __a
147/// A 128-bit vector of [2 x double] containing one of the source operands.
148/// \param __b
149/// A 128-bit vector of [2 x double] containing one of the source operands.
150/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151/// product of the lower 64 bits of both operands. The upper 64 bits are
152/// copied from the upper 64 bits of the first source operand.
153static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154 __m128d __b) {
155 __a[0] *= __b[0];
156 return __a;
157}
158
159/// Multiplies two 128-bit vectors of [2 x double].
160///
161/// \headerfile <x86intrin.h>
162///
163/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164///
165/// \param __a
166/// A 128-bit vector of [2 x double] containing one of the operands.
167/// \param __b
168/// A 128-bit vector of [2 x double] containing one of the operands.
169/// \returns A 128-bit vector of [2 x double] containing the products of both
170/// operands.
171static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172 __m128d __b) {
173 return (__m128d)((__v2df)__a * (__v2df)__b);
174}
175
176/// Divides the lower double-precision value of the first operand by the
177/// lower double-precision value of the second operand and returns the
178/// quotient in the lower 64 bits of the result. The upper 64 bits of the
179/// result are copied from the upper double-precision value of the first
180/// operand.
181///
182/// \headerfile <x86intrin.h>
183///
184/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185///
186/// \param __a
187/// A 128-bit vector of [2 x double] containing the dividend.
188/// \param __b
///    A 128-bit vector of [2 x double] containing the divisor.
190/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191/// quotient of the lower 64 bits of both operands. The upper 64 bits are
192/// copied from the upper 64 bits of the first source operand.
193static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194 __m128d __b) {
195 __a[0] /= __b[0];
196 return __a;
197}
198
199/// Performs an element-by-element division of two 128-bit vectors of
200/// [2 x double].
201///
202/// \headerfile <x86intrin.h>
203///
204/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205///
206/// \param __a
207/// A 128-bit vector of [2 x double] containing the dividend.
208/// \param __b
209/// A 128-bit vector of [2 x double] containing the divisor.
210/// \returns A 128-bit vector of [2 x double] containing the quotients of both
211/// operands.
212static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213 __m128d __b) {
214 return (__m128d)((__v2df)__a / (__v2df)__b);
215}
216
217/// Calculates the square root of the lower double-precision value of
218/// the second operand and returns it in the lower 64 bits of the result.
219/// The upper 64 bits of the result are copied from the upper
220/// double-precision value of the first operand.
221///
222/// \headerfile <x86intrin.h>
223///
224/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225///
226/// \param __a
227/// A 128-bit vector of [2 x double] containing one of the operands. The
228/// upper 64 bits of this operand are copied to the upper 64 bits of the
229/// result.
230/// \param __b
231/// A 128-bit vector of [2 x double] containing one of the operands. The
232/// square root is calculated using the lower 64 bits of this operand.
233/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234/// square root of the lower 64 bits of operand \a __b, and whose upper 64
235/// bits are copied from the upper 64 bits of operand \a __a.
236static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237 __m128d __b) {
238 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239 return __extension__(__m128d){__c[0], __a[1]};
240}
241
/// Calculates the square root of each of the two values stored in a
///    128-bit vector of [2 x double].
244///
245/// \headerfile <x86intrin.h>
246///
247/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248///
249/// \param __a
250/// A 128-bit vector of [2 x double].
251/// \returns A 128-bit vector of [2 x double] containing the square roots of the
252/// values in the operand.
253static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254 return __builtin_ia32_sqrtpd((__v2df)__a);
255}
256
257/// Compares lower 64-bit double-precision values of both operands, and
258/// returns the lesser of the pair of values in the lower 64-bits of the
259/// result. The upper 64 bits of the result are copied from the upper
260/// double-precision value of the first operand.
261///
262/// If either value in a comparison is NaN, returns the value from \a __b.
263///
264/// \headerfile <x86intrin.h>
265///
266/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
267///
268/// \param __a
269/// A 128-bit vector of [2 x double] containing one of the operands. The
270/// lower 64 bits of this operand are used in the comparison.
271/// \param __b
272/// A 128-bit vector of [2 x double] containing one of the operands. The
273/// lower 64 bits of this operand are used in the comparison.
274/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
275/// minimum value between both operands. The upper 64 bits are copied from
276/// the upper 64 bits of the first source operand.
277static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
278 __m128d __b) {
279 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
280}
281
282/// Performs element-by-element comparison of the two 128-bit vectors of
283/// [2 x double] and returns a vector containing the lesser of each pair of
284/// values.
285///
286/// If either value in a comparison is NaN, returns the value from \a __b.
287///
288/// \headerfile <x86intrin.h>
289///
290/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
291///
292/// \param __a
293/// A 128-bit vector of [2 x double] containing one of the operands.
294/// \param __b
295/// A 128-bit vector of [2 x double] containing one of the operands.
296/// \returns A 128-bit vector of [2 x double] containing the minimum values
297/// between both operands.
298static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
299 __m128d __b) {
300 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
301}
302
303/// Compares lower 64-bit double-precision values of both operands, and
304/// returns the greater of the pair of values in the lower 64-bits of the
305/// result. The upper 64 bits of the result are copied from the upper
306/// double-precision value of the first operand.
307///
308/// If either value in a comparison is NaN, returns the value from \a __b.
309///
310/// \headerfile <x86intrin.h>
311///
312/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
313///
314/// \param __a
315/// A 128-bit vector of [2 x double] containing one of the operands. The
316/// lower 64 bits of this operand are used in the comparison.
317/// \param __b
318/// A 128-bit vector of [2 x double] containing one of the operands. The
319/// lower 64 bits of this operand are used in the comparison.
320/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
321/// maximum value between both operands. The upper 64 bits are copied from
322/// the upper 64 bits of the first source operand.
323static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
324 __m128d __b) {
325 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
326}
327
328/// Performs element-by-element comparison of the two 128-bit vectors of
329/// [2 x double] and returns a vector containing the greater of each pair
330/// of values.
331///
332/// If either value in a comparison is NaN, returns the value from \a __b.
333///
334/// \headerfile <x86intrin.h>
335///
336/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
337///
338/// \param __a
339/// A 128-bit vector of [2 x double] containing one of the operands.
340/// \param __b
341/// A 128-bit vector of [2 x double] containing one of the operands.
342/// \returns A 128-bit vector of [2 x double] containing the maximum values
343/// between both operands.
344static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
345 __m128d __b) {
346 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
347}
348
349/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
350///
351/// \headerfile <x86intrin.h>
352///
353/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
354///
355/// \param __a
356/// A 128-bit vector of [2 x double] containing one of the source operands.
357/// \param __b
358/// A 128-bit vector of [2 x double] containing one of the source operands.
359/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
360/// values between both operands.
361static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
362 __m128d __b) {
363 return (__m128d)((__v2du)__a & (__v2du)__b);
364}
365
366/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
367/// the one's complement of the values contained in the first source operand.
368///
369/// \headerfile <x86intrin.h>
370///
371/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
372///
373/// \param __a
374/// A 128-bit vector of [2 x double] containing the left source operand. The
375/// one's complement of this value is used in the bitwise AND.
376/// \param __b
377/// A 128-bit vector of [2 x double] containing the right source operand.
378/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
379/// values in the second operand and the one's complement of the first
380/// operand.
381static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
382 __m128d __b) {
383 return (__m128d)(~(__v2du)__a & (__v2du)__b);
384}
385
386/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
387///
388/// \headerfile <x86intrin.h>
389///
390/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
391///
392/// \param __a
393/// A 128-bit vector of [2 x double] containing one of the source operands.
394/// \param __b
395/// A 128-bit vector of [2 x double] containing one of the source operands.
396/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
397/// values between both operands.
398static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
399 __m128d __b) {
400 return (__m128d)((__v2du)__a | (__v2du)__b);
401}
402
403/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
404///
405/// \headerfile <x86intrin.h>
406///
407/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
408///
409/// \param __a
410/// A 128-bit vector of [2 x double] containing one of the source operands.
411/// \param __b
412/// A 128-bit vector of [2 x double] containing one of the source operands.
413/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
414/// values between both operands.
415static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
416 __m128d __b) {
417 return (__m128d)((__v2du)__a ^ (__v2du)__b);
418}
419
420/// Compares each of the corresponding double-precision values of the
421/// 128-bit vectors of [2 x double] for equality.
422///
423/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
424/// If either value in a comparison is NaN, returns false.
425///
426/// \headerfile <x86intrin.h>
427///
428/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
429///
430/// \param __a
431/// A 128-bit vector of [2 x double].
432/// \param __b
433/// A 128-bit vector of [2 x double].
434/// \returns A 128-bit vector containing the comparison results.
435static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
436 __m128d __b) {
437 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
438}
439
440/// Compares each of the corresponding double-precision values of the
441/// 128-bit vectors of [2 x double] to determine if the values in the first
442/// operand are less than those in the second operand.
443///
444/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
445/// If either value in a comparison is NaN, returns false.
446///
447/// \headerfile <x86intrin.h>
448///
449/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
450///
451/// \param __a
452/// A 128-bit vector of [2 x double].
453/// \param __b
454/// A 128-bit vector of [2 x double].
455/// \returns A 128-bit vector containing the comparison results.
456static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
457 __m128d __b) {
458 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
459}
460
461/// Compares each of the corresponding double-precision values of the
462/// 128-bit vectors of [2 x double] to determine if the values in the first
463/// operand are less than or equal to those in the second operand.
464///
465/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
466/// If either value in a comparison is NaN, returns false.
467///
468/// \headerfile <x86intrin.h>
469///
470/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
471///
472/// \param __a
473/// A 128-bit vector of [2 x double].
474/// \param __b
475/// A 128-bit vector of [2 x double].
476/// \returns A 128-bit vector containing the comparison results.
477static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
478 __m128d __b) {
479 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
480}
481
482/// Compares each of the corresponding double-precision values of the
483/// 128-bit vectors of [2 x double] to determine if the values in the first
484/// operand are greater than those in the second operand.
485///
486/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
487/// If either value in a comparison is NaN, returns false.
488///
489/// \headerfile <x86intrin.h>
490///
491/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
492///
493/// \param __a
494/// A 128-bit vector of [2 x double].
495/// \param __b
496/// A 128-bit vector of [2 x double].
497/// \returns A 128-bit vector containing the comparison results.
498static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
499 __m128d __b) {
500 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
501}
502
503/// Compares each of the corresponding double-precision values of the
504/// 128-bit vectors of [2 x double] to determine if the values in the first
505/// operand are greater than or equal to those in the second operand.
506///
507/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
508/// If either value in a comparison is NaN, returns false.
509///
510/// \headerfile <x86intrin.h>
511///
512/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
513///
514/// \param __a
515/// A 128-bit vector of [2 x double].
516/// \param __b
517/// A 128-bit vector of [2 x double].
518/// \returns A 128-bit vector containing the comparison results.
519static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
520 __m128d __b) {
521 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
522}
523
524/// Compares each of the corresponding double-precision values of the
525/// 128-bit vectors of [2 x double] to determine if the values in the first
526/// operand are ordered with respect to those in the second operand.
527///
528/// A pair of double-precision values are ordered with respect to each
529/// other if neither value is a NaN. Each comparison returns 0x0 for false,
530/// 0xFFFFFFFFFFFFFFFF for true.
531///
532/// \headerfile <x86intrin.h>
533///
534/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
535///
536/// \param __a
537/// A 128-bit vector of [2 x double].
538/// \param __b
539/// A 128-bit vector of [2 x double].
540/// \returns A 128-bit vector containing the comparison results.
541static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
542 __m128d __b) {
543 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
544}
545
546/// Compares each of the corresponding double-precision values of the
547/// 128-bit vectors of [2 x double] to determine if the values in the first
548/// operand are unordered with respect to those in the second operand.
549///
550/// A pair of double-precision values are unordered with respect to each
551/// other if one or both values are NaN. Each comparison returns 0x0 for
552/// false, 0xFFFFFFFFFFFFFFFF for true.
553///
554/// \headerfile <x86intrin.h>
555///
556/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
557/// instruction.
558///
559/// \param __a
560/// A 128-bit vector of [2 x double].
561/// \param __b
562/// A 128-bit vector of [2 x double].
563/// \returns A 128-bit vector containing the comparison results.
564static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
565 __m128d __b) {
566 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
567}
568
569/// Compares each of the corresponding double-precision values of the
570/// 128-bit vectors of [2 x double] to determine if the values in the first
571/// operand are unequal to those in the second operand.
572///
573/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
574/// If either value in a comparison is NaN, returns true.
575///
576/// \headerfile <x86intrin.h>
577///
578/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
579///
580/// \param __a
581/// A 128-bit vector of [2 x double].
582/// \param __b
583/// A 128-bit vector of [2 x double].
584/// \returns A 128-bit vector containing the comparison results.
585static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
586 __m128d __b) {
587 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
588}
589
590/// Compares each of the corresponding double-precision values of the
591/// 128-bit vectors of [2 x double] to determine if the values in the first
592/// operand are not less than those in the second operand.
593///
594/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
595/// If either value in a comparison is NaN, returns true.
596///
597/// \headerfile <x86intrin.h>
598///
599/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
600///
601/// \param __a
602/// A 128-bit vector of [2 x double].
603/// \param __b
604/// A 128-bit vector of [2 x double].
605/// \returns A 128-bit vector containing the comparison results.
606static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
607 __m128d __b) {
608 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
609}
610
611/// Compares each of the corresponding double-precision values of the
612/// 128-bit vectors of [2 x double] to determine if the values in the first
613/// operand are not less than or equal to those in the second operand.
614///
615/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
616/// If either value in a comparison is NaN, returns true.
617///
618/// \headerfile <x86intrin.h>
619///
620/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
621///
622/// \param __a
623/// A 128-bit vector of [2 x double].
624/// \param __b
625/// A 128-bit vector of [2 x double].
626/// \returns A 128-bit vector containing the comparison results.
627static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
628 __m128d __b) {
629 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
630}
631
632/// Compares each of the corresponding double-precision values of the
633/// 128-bit vectors of [2 x double] to determine if the values in the first
634/// operand are not greater than those in the second operand.
635///
636/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
637/// If either value in a comparison is NaN, returns true.
638///
639/// \headerfile <x86intrin.h>
640///
641/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
642///
643/// \param __a
644/// A 128-bit vector of [2 x double].
645/// \param __b
646/// A 128-bit vector of [2 x double].
647/// \returns A 128-bit vector containing the comparison results.
648static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
649 __m128d __b) {
650 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
651}
652
653/// Compares each of the corresponding double-precision values of the
654/// 128-bit vectors of [2 x double] to determine if the values in the first
655/// operand are not greater than or equal to those in the second operand.
656///
657/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
658/// If either value in a comparison is NaN, returns true.
659///
660/// \headerfile <x86intrin.h>
661///
662/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
663///
664/// \param __a
665/// A 128-bit vector of [2 x double].
666/// \param __b
667/// A 128-bit vector of [2 x double].
668/// \returns A 128-bit vector containing the comparison results.
669static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
670 __m128d __b) {
671 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
672}
673
674/// Compares the lower double-precision floating-point values in each of
675/// the two 128-bit floating-point vectors of [2 x double] for equality.
676///
677/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
678/// If either value in a comparison is NaN, returns false.
679///
680/// \headerfile <x86intrin.h>
681///
682/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
683///
684/// \param __a
685/// A 128-bit vector of [2 x double]. The lower double-precision value is
686/// compared to the lower double-precision value of \a __b.
687/// \param __b
688/// A 128-bit vector of [2 x double]. The lower double-precision value is
689/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
692static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
693 __m128d __b) {
694 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
695}
696
697/// Compares the lower double-precision floating-point values in each of
698/// the two 128-bit floating-point vectors of [2 x double] to determine if
699/// the value in the first parameter is less than the corresponding value in
700/// the second parameter.
701///
702/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
703/// If either value in a comparison is NaN, returns false.
704///
705/// \headerfile <x86intrin.h>
706///
707/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
708///
709/// \param __a
710/// A 128-bit vector of [2 x double]. The lower double-precision value is
711/// compared to the lower double-precision value of \a __b.
712/// \param __b
713/// A 128-bit vector of [2 x double]. The lower double-precision value is
714/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
717static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
718 __m128d __b) {
719 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
720}
721
722/// Compares the lower double-precision floating-point values in each of
723/// the two 128-bit floating-point vectors of [2 x double] to determine if
724/// the value in the first parameter is less than or equal to the
725/// corresponding value in the second parameter.
726///
727/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
728/// If either value in a comparison is NaN, returns false.
729///
730/// \headerfile <x86intrin.h>
731///
732/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
733///
734/// \param __a
735/// A 128-bit vector of [2 x double]. The lower double-precision value is
736/// compared to the lower double-precision value of \a __b.
737/// \param __b
738/// A 128-bit vector of [2 x double]. The lower double-precision value is
739/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
742static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
743 __m128d __b) {
744 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
745}
746
747/// Compares the lower double-precision floating-point values in each of
748/// the two 128-bit floating-point vectors of [2 x double] to determine if
749/// the value in the first parameter is greater than the corresponding value
750/// in the second parameter.
751///
752/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
753/// If either value in a comparison is NaN, returns false.
754///
755/// \headerfile <x86intrin.h>
756///
757/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
758///
759/// \param __a
760/// A 128-bit vector of [2 x double]. The lower double-precision value is
761/// compared to the lower double-precision value of \a __b.
762/// \param __b
763/// A 128-bit vector of [2 x double]. The lower double-precision value is
764/// compared to the lower double-precision value of \a __a.
765/// \returns A 128-bit vector. The lower 64 bits contains the comparison
766/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
767static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
768 __m128d __b) {
769 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
770 return __extension__(__m128d){__c[0], __a[1]};
771}
772
773/// Compares the lower double-precision floating-point values in each of
774/// the two 128-bit floating-point vectors of [2 x double] to determine if
775/// the value in the first parameter is greater than or equal to the
776/// corresponding value in the second parameter.
777///
778/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
779/// If either value in a comparison is NaN, returns false.
780///
781/// \headerfile <x86intrin.h>
782///
783/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
784///
785/// \param __a
786/// A 128-bit vector of [2 x double]. The lower double-precision value is
787/// compared to the lower double-precision value of \a __b.
788/// \param __b
789/// A 128-bit vector of [2 x double]. The lower double-precision value is
790/// compared to the lower double-precision value of \a __a.
791/// \returns A 128-bit vector. The lower 64 bits contains the comparison
792/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
793static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
794 __m128d __b) {
795 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
796 return __extension__(__m128d){__c[0], __a[1]};
797}
798
799/// Compares the lower double-precision floating-point values in each of
800/// the two 128-bit floating-point vectors of [2 x double] to determine if
801/// the value in the first parameter is ordered with respect to the
802/// corresponding value in the second parameter.
803///
804/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
805/// of double-precision values are ordered with respect to each other if
806/// neither value is a NaN.
807///
808/// \headerfile <x86intrin.h>
809///
810/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
811///
812/// \param __a
813/// A 128-bit vector of [2 x double]. The lower double-precision value is
814/// compared to the lower double-precision value of \a __b.
815/// \param __b
816/// A 128-bit vector of [2 x double]. The lower double-precision value is
817/// compared to the lower double-precision value of \a __a.
818/// \returns A 128-bit vector. The lower 64 bits contains the comparison
819/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
820static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
821 __m128d __b) {
822 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
823}
824
825/// Compares the lower double-precision floating-point values in each of
826/// the two 128-bit floating-point vectors of [2 x double] to determine if
827/// the value in the first parameter is unordered with respect to the
828/// corresponding value in the second parameter.
829///
830/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
831/// of double-precision values are unordered with respect to each other if
832/// one or both values are NaN.
833///
834/// \headerfile <x86intrin.h>
835///
836/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
837/// instruction.
838///
839/// \param __a
840/// A 128-bit vector of [2 x double]. The lower double-precision value is
841/// compared to the lower double-precision value of \a __b.
842/// \param __b
843/// A 128-bit vector of [2 x double]. The lower double-precision value is
844/// compared to the lower double-precision value of \a __a.
845/// \returns A 128-bit vector. The lower 64 bits contains the comparison
846/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
847static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
848 __m128d __b) {
849 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
850}
851
852/// Compares the lower double-precision floating-point values in each of
853/// the two 128-bit floating-point vectors of [2 x double] to determine if
854/// the value in the first parameter is unequal to the corresponding value in
855/// the second parameter.
856///
857/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
858/// If either value in a comparison is NaN, returns true.
859///
860/// \headerfile <x86intrin.h>
861///
862/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
863///
864/// \param __a
865/// A 128-bit vector of [2 x double]. The lower double-precision value is
866/// compared to the lower double-precision value of \a __b.
867/// \param __b
868/// A 128-bit vector of [2 x double]. The lower double-precision value is
869/// compared to the lower double-precision value of \a __a.
870/// \returns A 128-bit vector. The lower 64 bits contains the comparison
871/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
872static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
873 __m128d __b) {
874 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
875}
876
877/// Compares the lower double-precision floating-point values in each of
878/// the two 128-bit floating-point vectors of [2 x double] to determine if
879/// the value in the first parameter is not less than the corresponding
880/// value in the second parameter.
881///
882/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
883/// If either value in a comparison is NaN, returns true.
884///
885/// \headerfile <x86intrin.h>
886///
887/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
888///
889/// \param __a
890/// A 128-bit vector of [2 x double]. The lower double-precision value is
891/// compared to the lower double-precision value of \a __b.
892/// \param __b
893/// A 128-bit vector of [2 x double]. The lower double-precision value is
894/// compared to the lower double-precision value of \a __a.
895/// \returns A 128-bit vector. The lower 64 bits contains the comparison
896/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
897static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
898 __m128d __b) {
899 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
900}
901
902/// Compares the lower double-precision floating-point values in each of
903/// the two 128-bit floating-point vectors of [2 x double] to determine if
904/// the value in the first parameter is not less than or equal to the
905/// corresponding value in the second parameter.
906///
907/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
908/// If either value in a comparison is NaN, returns true.
909///
910/// \headerfile <x86intrin.h>
911///
912/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
913///
914/// \param __a
915/// A 128-bit vector of [2 x double]. The lower double-precision value is
916/// compared to the lower double-precision value of \a __b.
917/// \param __b
918/// A 128-bit vector of [2 x double]. The lower double-precision value is
919/// compared to the lower double-precision value of \a __a.
920/// \returns A 128-bit vector. The lower 64 bits contains the comparison
921/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
922static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
923 __m128d __b) {
924 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
925}
926
927/// Compares the lower double-precision floating-point values in each of
928/// the two 128-bit floating-point vectors of [2 x double] to determine if
929/// the value in the first parameter is not greater than the corresponding
930/// value in the second parameter.
931///
932/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
933/// If either value in a comparison is NaN, returns true.
934///
935/// \headerfile <x86intrin.h>
936///
937/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
938///
939/// \param __a
940/// A 128-bit vector of [2 x double]. The lower double-precision value is
941/// compared to the lower double-precision value of \a __b.
942/// \param __b
943/// A 128-bit vector of [2 x double]. The lower double-precision value is
944/// compared to the lower double-precision value of \a __a.
945/// \returns A 128-bit vector. The lower 64 bits contains the comparison
946/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
947static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
948 __m128d __b) {
949 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
950 return __extension__(__m128d){__c[0], __a[1]};
951}
952
953/// Compares the lower double-precision floating-point values in each of
954/// the two 128-bit floating-point vectors of [2 x double] to determine if
955/// the value in the first parameter is not greater than or equal to the
956/// corresponding value in the second parameter.
957///
958/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
959/// If either value in a comparison is NaN, returns true.
960///
961/// \headerfile <x86intrin.h>
962///
963/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
964///
965/// \param __a
966/// A 128-bit vector of [2 x double]. The lower double-precision value is
967/// compared to the lower double-precision value of \a __b.
968/// \param __b
969/// A 128-bit vector of [2 x double]. The lower double-precision value is
970/// compared to the lower double-precision value of \a __a.
971/// \returns A 128-bit vector. The lower 64 bits contains the comparison
972/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
973static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
974 __m128d __b) {
975 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
976 return __extension__(__m128d){__c[0], __a[1]};
977}
978
979/// Compares the lower double-precision floating-point values in each of
980/// the two 128-bit floating-point vectors of [2 x double] for equality.
981///
982/// The comparison returns 0 for false, 1 for true. If either value in a
983/// comparison is NaN, returns 0.
984///
985/// \headerfile <x86intrin.h>
986///
987/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
988///
989/// \param __a
990/// A 128-bit vector of [2 x double]. The lower double-precision value is
991/// compared to the lower double-precision value of \a __b.
992/// \param __b
993/// A 128-bit vector of [2 x double]. The lower double-precision value is
994/// compared to the lower double-precision value of \a __a.
995/// \returns An integer containing the comparison results.
996static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
997 __m128d __b) {
998 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
999}
1000
1001/// Compares the lower double-precision floating-point values in each of
1002/// the two 128-bit floating-point vectors of [2 x double] to determine if
1003/// the value in the first parameter is less than the corresponding value in
1004/// the second parameter.
1005///
1006/// The comparison returns 0 for false, 1 for true. If either value in a
1007/// comparison is NaN, returns 0.
1008///
1009/// \headerfile <x86intrin.h>
1010///
1011/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1012///
1013/// \param __a
1014/// A 128-bit vector of [2 x double]. The lower double-precision value is
1015/// compared to the lower double-precision value of \a __b.
1016/// \param __b
1017/// A 128-bit vector of [2 x double]. The lower double-precision value is
1018/// compared to the lower double-precision value of \a __a.
1019/// \returns An integer containing the comparison results.
1020static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1021 __m128d __b) {
1022 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1023}
1024
1025/// Compares the lower double-precision floating-point values in each of
1026/// the two 128-bit floating-point vectors of [2 x double] to determine if
1027/// the value in the first parameter is less than or equal to the
1028/// corresponding value in the second parameter.
1029///
1030/// The comparison returns 0 for false, 1 for true. If either value in a
1031/// comparison is NaN, returns 0.
1032///
1033/// \headerfile <x86intrin.h>
1034///
1035/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1036///
1037/// \param __a
1038/// A 128-bit vector of [2 x double]. The lower double-precision value is
1039/// compared to the lower double-precision value of \a __b.
1040/// \param __b
1041/// A 128-bit vector of [2 x double]. The lower double-precision value is
1042/// compared to the lower double-precision value of \a __a.
1043/// \returns An integer containing the comparison results.
1044static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1045 __m128d __b) {
1046 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1047}
1048
1049/// Compares the lower double-precision floating-point values in each of
1050/// the two 128-bit floating-point vectors of [2 x double] to determine if
1051/// the value in the first parameter is greater than the corresponding value
1052/// in the second parameter.
1053///
1054/// The comparison returns 0 for false, 1 for true. If either value in a
1055/// comparison is NaN, returns 0.
1056///
1057/// \headerfile <x86intrin.h>
1058///
1059/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1060///
1061/// \param __a
1062/// A 128-bit vector of [2 x double]. The lower double-precision value is
1063/// compared to the lower double-precision value of \a __b.
1064/// \param __b
1065/// A 128-bit vector of [2 x double]. The lower double-precision value is
1066/// compared to the lower double-precision value of \a __a.
1067/// \returns An integer containing the comparison results.
1068static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1069 __m128d __b) {
1070 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1071}
1072
1073/// Compares the lower double-precision floating-point values in each of
1074/// the two 128-bit floating-point vectors of [2 x double] to determine if
1075/// the value in the first parameter is greater than or equal to the
1076/// corresponding value in the second parameter.
1077///
1078/// The comparison returns 0 for false, 1 for true. If either value in a
1079/// comparison is NaN, returns 0.
1080///
1081/// \headerfile <x86intrin.h>
1082///
1083/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1084///
1085/// \param __a
1086/// A 128-bit vector of [2 x double]. The lower double-precision value is
1087/// compared to the lower double-precision value of \a __b.
1088/// \param __b
1089/// A 128-bit vector of [2 x double]. The lower double-precision value is
1090/// compared to the lower double-precision value of \a __a.
1091/// \returns An integer containing the comparison results.
1092static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1093 __m128d __b) {
1094 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1095}
1096
1097/// Compares the lower double-precision floating-point values in each of
1098/// the two 128-bit floating-point vectors of [2 x double] to determine if
1099/// the value in the first parameter is unequal to the corresponding value in
1100/// the second parameter.
1101///
1102/// The comparison returns 0 for false, 1 for true. If either value in a
1103/// comparison is NaN, returns 1.
1104///
1105/// \headerfile <x86intrin.h>
1106///
1107/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1108///
1109/// \param __a
1110/// A 128-bit vector of [2 x double]. The lower double-precision value is
1111/// compared to the lower double-precision value of \a __b.
1112/// \param __b
1113/// A 128-bit vector of [2 x double]. The lower double-precision value is
1114/// compared to the lower double-precision value of \a __a.
1115/// \returns An integer containing the comparison results.
1116static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1117 __m128d __b) {
1118 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1119}
1120
1121/// Compares the lower double-precision floating-point values in each of
1122/// the two 128-bit floating-point vectors of [2 x double] for equality.
1123///
1124/// The comparison returns 0 for false, 1 for true. If either value in a
1125/// comparison is NaN, returns 0.
1126///
1127/// \headerfile <x86intrin.h>
1128///
1129/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1130///
1131/// \param __a
1132/// A 128-bit vector of [2 x double]. The lower double-precision value is
1133/// compared to the lower double-precision value of \a __b.
1134/// \param __b
1135/// A 128-bit vector of [2 x double]. The lower double-precision value is
1136/// compared to the lower double-precision value of \a __a.
1137/// \returns An integer containing the comparison results.
1138static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1139 __m128d __b) {
1140 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1141}
1142
1143/// Compares the lower double-precision floating-point values in each of
1144/// the two 128-bit floating-point vectors of [2 x double] to determine if
1145/// the value in the first parameter is less than the corresponding value in
1146/// the second parameter.
1147///
1148/// The comparison returns 0 for false, 1 for true. If either value in a
1149/// comparison is NaN, returns 0.
1150///
1151/// \headerfile <x86intrin.h>
1152///
1153/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1154///
1155/// \param __a
1156/// A 128-bit vector of [2 x double]. The lower double-precision value is
1157/// compared to the lower double-precision value of \a __b.
1158/// \param __b
1159/// A 128-bit vector of [2 x double]. The lower double-precision value is
1160/// compared to the lower double-precision value of \a __a.
1161/// \returns An integer containing the comparison results.
1162static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1163 __m128d __b) {
1164 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1165}
1166
1167/// Compares the lower double-precision floating-point values in each of
1168/// the two 128-bit floating-point vectors of [2 x double] to determine if
1169/// the value in the first parameter is less than or equal to the
1170/// corresponding value in the second parameter.
1171///
1172/// The comparison returns 0 for false, 1 for true. If either value in a
1173/// comparison is NaN, returns 0.
1174///
1175/// \headerfile <x86intrin.h>
1176///
1177/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1178///
1179/// \param __a
1180/// A 128-bit vector of [2 x double]. The lower double-precision value is
1181/// compared to the lower double-precision value of \a __b.
1182/// \param __b
1183/// A 128-bit vector of [2 x double]. The lower double-precision value is
1184/// compared to the lower double-precision value of \a __a.
1185/// \returns An integer containing the comparison results.
1186static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1187 __m128d __b) {
1188 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1189}
1190
1191/// Compares the lower double-precision floating-point values in each of
1192/// the two 128-bit floating-point vectors of [2 x double] to determine if
1193/// the value in the first parameter is greater than the corresponding value
1194/// in the second parameter.
1195///
1196/// The comparison returns 0 for false, 1 for true. If either value in a
1197/// comparison is NaN, returns 0.
1198///
1199/// \headerfile <x86intrin.h>
1200///
1201/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1202///
1203/// \param __a
1204/// A 128-bit vector of [2 x double]. The lower double-precision value is
1205/// compared to the lower double-precision value of \a __b.
1206/// \param __b
1207/// A 128-bit vector of [2 x double]. The lower double-precision value is
1208/// compared to the lower double-precision value of \a __a.
1209/// \returns An integer containing the comparison results.
1210static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1211 __m128d __b) {
1212 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1213}
1214
1215/// Compares the lower double-precision floating-point values in each of
1216/// the two 128-bit floating-point vectors of [2 x double] to determine if
1217/// the value in the first parameter is greater than or equal to the
1218/// corresponding value in the second parameter.
1219///
1220/// The comparison returns 0 for false, 1 for true. If either value in a
1221/// comparison is NaN, returns 0.
1222///
1223/// \headerfile <x86intrin.h>
1224///
1225/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1226///
1227/// \param __a
1228/// A 128-bit vector of [2 x double]. The lower double-precision value is
1229/// compared to the lower double-precision value of \a __b.
1230/// \param __b
1231/// A 128-bit vector of [2 x double]. The lower double-precision value is
1232/// compared to the lower double-precision value of \a __a.
1233/// \returns An integer containing the comparison results.
1234static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1235 __m128d __b) {
1236 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1237}
1238
1239/// Compares the lower double-precision floating-point values in each of
1240/// the two 128-bit floating-point vectors of [2 x double] to determine if
1241/// the value in the first parameter is unequal to the corresponding value in
1242/// the second parameter.
1243///
1244/// The comparison returns 0 for false, 1 for true. If either value in a
1245/// comparison is NaN, returns 1.
1246///
1247/// \headerfile <x86intrin.h>
1248///
1249/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1250///
1251/// \param __a
1252/// A 128-bit vector of [2 x double]. The lower double-precision value is
1253/// compared to the lower double-precision value of \a __b.
1254/// \param __b
1255/// A 128-bit vector of [2 x double]. The lower double-precision value is
1256/// compared to the lower double-precision value of \a __a.
1257/// \returns An integer containing the comparison result.
1258static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1259 __m128d __b) {
1260 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1261}
1262
1263/// Converts the two double-precision floating-point elements of a
1264/// 128-bit vector of [2 x double] into two single-precision floating-point
1265/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1266/// The upper 64 bits of the result vector are set to zero.
1267///
1268/// \headerfile <x86intrin.h>
1269///
1270/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1271///
1272/// \param __a
1273/// A 128-bit vector of [2 x double].
1274/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1275/// converted values. The upper 64 bits are set to zero.
1276static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1277 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1278}
1279
1280/// Converts the lower two single-precision floating-point elements of a
1281/// 128-bit vector of [4 x float] into two double-precision floating-point
1282/// values, returned in a 128-bit vector of [2 x double]. The upper two
1283/// elements of the input vector are unused.
1284///
1285/// \headerfile <x86intrin.h>
1286///
1287/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1288///
1289/// \param __a
1290/// A 128-bit vector of [4 x float]. The lower two single-precision
1291/// floating-point elements are converted to double-precision values. The
1292/// upper two elements are unused.
1293/// \returns A 128-bit vector of [2 x double] containing the converted values.
1294static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1295 return (__m128d) __builtin_convertvector(
1296 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1297}
1298
1299/// Converts the lower two integer elements of a 128-bit vector of
1300/// [4 x i32] into two double-precision floating-point values, returned in a
1301/// 128-bit vector of [2 x double].
1302///
1303/// The upper two elements of the input vector are unused.
1304///
1305/// \headerfile <x86intrin.h>
1306///
1307/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1308///
1309/// \param __a
1310/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1311/// converted to double-precision values.
1312///
1313/// The upper two elements are unused.
1314/// \returns A 128-bit vector of [2 x double] containing the converted values.
1315static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1316 return (__m128d) __builtin_convertvector(
1317 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1318}
1319
1320/// Converts the two double-precision floating-point elements of a
1321/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1322/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1323/// 64 bits of the result vector are set to zero.
1324///
1325/// If a converted value does not fit in a 32-bit integer, raises a
1326/// floating-point invalid exception. If the exception is masked, returns
1327/// the most negative integer.
1328///
1329/// \headerfile <x86intrin.h>
1330///
1331/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1332///
1333/// \param __a
1334/// A 128-bit vector of [2 x double].
1335/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1336/// converted values. The upper 64 bits are set to zero.
1337static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1338 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1339}
1340
1341/// Converts the low-order element of a 128-bit vector of [2 x double]
1342/// into a 32-bit signed integer value.
1343///
1344/// If the converted value does not fit in a 32-bit integer, raises a
1345/// floating-point invalid exception. If the exception is masked, returns
1346/// the most negative integer.
1347///
1348/// \headerfile <x86intrin.h>
1349///
1350/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1351///
1352/// \param __a
1353/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1354/// conversion.
1355/// \returns A 32-bit signed integer containing the converted value.
1356static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1357 return __builtin_ia32_cvtsd2si((__v2df)__a);
1358}
1359
1360/// Converts the lower double-precision floating-point element of a
1361/// 128-bit vector of [2 x double], in the second parameter, into a
1362/// single-precision floating-point value, returned in the lower 32 bits of a
1363/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1364/// copied from the upper 96 bits of the first parameter.
1365///
1366/// \headerfile <x86intrin.h>
1367///
1368/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1369///
1370/// \param __a
1371/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1372/// copied to the upper 96 bits of the result.
1373/// \param __b
1374/// A 128-bit vector of [2 x double]. The lower double-precision
1375/// floating-point element is used in the conversion.
1376/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1377/// converted value from the second parameter. The upper 96 bits are copied
1378/// from the upper 96 bits of the first parameter.
1379static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1380 __m128d __b) {
1381 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1382}
1383
1384/// Converts a 32-bit signed integer value, in the second parameter, into
1385/// a double-precision floating-point value, returned in the lower 64 bits of
1386/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1387/// are copied from the upper 64 bits of the first parameter.
1388///
1389/// \headerfile <x86intrin.h>
1390///
1391/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1392///
1393/// \param __a
1394/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1395/// copied to the upper 64 bits of the result.
1396/// \param __b
1397/// A 32-bit signed integer containing the value to be converted.
1398/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1399/// converted value from the second parameter. The upper 64 bits are copied
1400/// from the upper 64 bits of the first parameter.
1401static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1402 int __b) {
1403 __a[0] = __b;
1404 return __a;
1405}
1406
1407/// Converts the lower single-precision floating-point element of a
1408/// 128-bit vector of [4 x float], in the second parameter, into a
1409/// double-precision floating-point value, returned in the lower 64 bits of
1410/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1411/// are copied from the upper 64 bits of the first parameter.
1412///
1413/// \headerfile <x86intrin.h>
1414///
1415/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1416///
1417/// \param __a
1418/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1419/// copied to the upper 64 bits of the result.
1420/// \param __b
1421/// A 128-bit vector of [4 x float]. The lower single-precision
1422/// floating-point element is used in the conversion.
1423/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1424/// converted value from the second parameter. The upper 64 bits are copied
1425/// from the upper 64 bits of the first parameter.
1426static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1427 __m128 __b) {
1428 __a[0] = __b[0];
1429 return __a;
1430}
1431
1432/// Converts the two double-precision floating-point elements of a
1433/// 128-bit vector of [2 x double] into two signed truncated (rounded
1434/// toward zero) 32-bit integer values, returned in the lower 64 bits
1435/// of a 128-bit vector of [4 x i32].
1436///
1437/// If a converted value does not fit in a 32-bit integer, raises a
1438/// floating-point invalid exception. If the exception is masked, returns
1439/// the most negative integer.
1440///
1441/// \headerfile <x86intrin.h>
1442///
1443/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1444/// instruction.
1445///
1446/// \param __a
1447/// A 128-bit vector of [2 x double].
1448/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1449/// converted values. The upper 64 bits are set to zero.
1450static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1451 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1452}
1453
1454/// Converts the low-order element of a [2 x double] vector into a 32-bit
1455/// signed truncated (rounded toward zero) integer value.
1456///
1457/// If the converted value does not fit in a 32-bit integer, raises a
1458/// floating-point invalid exception. If the exception is masked, returns
1459/// the most negative integer.
1460///
1461/// \headerfile <x86intrin.h>
1462///
1463/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1464/// instruction.
1465///
1466/// \param __a
1467/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1468/// conversion.
1469/// \returns A 32-bit signed integer containing the converted value.
1470static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1471 return __builtin_ia32_cvttsd2si((__v2df)__a);
1472}
1473
1474/// Converts the two double-precision floating-point elements of a
1475/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1476/// returned in a 64-bit vector of [2 x i32].
1477///
1478/// If a converted value does not fit in a 32-bit integer, raises a
1479/// floating-point invalid exception. If the exception is masked, returns
1480/// the most negative integer.
1481///
1482/// \headerfile <x86intrin.h>
1483///
1484/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1485///
1486/// \param __a
1487/// A 128-bit vector of [2 x double].
1488/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1489static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1490 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1491}
1492
1493/// Converts the two double-precision floating-point elements of a
1494/// 128-bit vector of [2 x double] into two signed truncated (rounded toward
1495/// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1496///
1497/// If a converted value does not fit in a 32-bit integer, raises a
1498/// floating-point invalid exception. If the exception is masked, returns
1499/// the most negative integer.
1500///
1501/// \headerfile <x86intrin.h>
1502///
1503/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1504///
1505/// \param __a
1506/// A 128-bit vector of [2 x double].
1507/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1508static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1509 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1510}
1511
1512/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1513/// [2 x i32] into two double-precision floating-point values, returned in a
1514/// 128-bit vector of [2 x double].
1515///
1516/// \headerfile <x86intrin.h>
1517///
1518/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1519///
1520/// \param __a
1521/// A 64-bit vector of [2 x i32].
1522/// \returns A 128-bit vector of [2 x double] containing the converted values.
1523static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1524 return __builtin_ia32_cvtpi2pd((__v2si)__a);
1525}
1526
1527/// Returns the low-order element of a 128-bit vector of [2 x double] as
1528/// a double-precision floating-point value.
1529///
1530/// \headerfile <x86intrin.h>
1531///
1532/// This intrinsic has no corresponding instruction.
1533///
1534/// \param __a
1535/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1536/// \returns A double-precision floating-point value copied from the lower 64
1537/// bits of \a __a.
1538static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1539 return __a[0];
1540}
1541
1542/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1543/// memory location.
1544///
1545/// \headerfile <x86intrin.h>
1546///
1547/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1548///
1549/// \param __dp
1550/// A pointer to a 128-bit memory location. The address of the memory
1551/// location has to be 16-byte aligned.
1552/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1553static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1554 return *(const __m128d *)__dp;
1555}
1556
1557/// Loads a double-precision floating-point value from a specified memory
1558/// location and duplicates it to both vector elements of a 128-bit vector of
1559/// [2 x double].
1560///
1561/// \headerfile <x86intrin.h>
1562///
1563/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1564///
1565/// \param __dp
1566/// A pointer to a memory location containing a double-precision value.
1567/// \returns A 128-bit vector of [2 x double] containing the loaded and
1568/// duplicated values.
1569static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1570 struct __mm_load1_pd_struct {
1571 double __u;
1572 } __attribute__((__packed__, __may_alias__));
1573 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1574 return __extension__(__m128d){__u, __u};
1575}
1576
/* Alternate spelling of _mm_load1_pd; expands to a call with the same
 * argument. */
#define _mm_load_pd1(dp) _mm_load1_pd((dp))
1578
1579/// Loads two double-precision values, in reverse order, from an aligned
1580/// memory location into a 128-bit vector of [2 x double].
1581///
1582/// \headerfile <x86intrin.h>
1583///
1584/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1585/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1586/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1587///
1588/// \param __dp
1589/// A 16-byte aligned pointer to an array of double-precision values to be
1590/// loaded in reverse order.
1591/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1592/// values.
1593static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1594 __m128d __u = *(const __m128d *)__dp;
1595 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1596}
1597
1598/// Loads a 128-bit floating-point vector of [2 x double] from an
1599/// unaligned memory location.
1600///
1601/// \headerfile <x86intrin.h>
1602///
1603/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1604///
1605/// \param __dp
1606/// A pointer to a 128-bit memory location. The address of the memory
1607/// location does not have to be aligned.
1608/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1609static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1610 struct __loadu_pd {
1611 __m128d_u __v;
1612 } __attribute__((__packed__, __may_alias__));
1613 return ((const struct __loadu_pd *)__dp)->__v;
1614}
1615
1616/// Loads a 64-bit integer value to the low element of a 128-bit integer
1617/// vector and clears the upper element.
1618///
1619/// \headerfile <x86intrin.h>
1620///
1621/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1622///
1623/// \param __a
1624/// A pointer to a 64-bit memory location. The address of the memory
1625/// location does not have to be aligned.
1626/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1627static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1628 struct __loadu_si64 {
1629 long long __v;
1630 } __attribute__((__packed__, __may_alias__));
1631 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1632 return __extension__(__m128i)(__v2di){__u, 0LL};
1633}
1634
/// Loads a 32-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1637///
1638/// \headerfile <x86intrin.h>
1639///
1640/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1641///
1642/// \param __a
1643/// A pointer to a 32-bit memory location. The address of the memory
1644/// location does not have to be aligned.
1645/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1646static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1647 struct __loadu_si32 {
1648 int __v;
1649 } __attribute__((__packed__, __may_alias__));
1650 int __u = ((const struct __loadu_si32 *)__a)->__v;
1651 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1652}
1653
/// Loads a 16-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1656///
1657/// \headerfile <x86intrin.h>
1658///
1659/// This intrinsic does not correspond to a specific instruction.
1660///
1661/// \param __a
1662/// A pointer to a 16-bit memory location. The address of the memory
1663/// location does not have to be aligned.
1664/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1665static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1666 struct __loadu_si16 {
1667 short __v;
1668 } __attribute__((__packed__, __may_alias__));
1669 short __u = ((const struct __loadu_si16 *)__a)->__v;
1670 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1671}
1672
/// Loads a 64-bit double-precision value to the low element of a
///    128-bit vector of [2 x double] and clears the upper element.
1675///
1676/// \headerfile <x86intrin.h>
1677///
1678/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1679///
1680/// \param __dp
1681/// A pointer to a memory location containing a double-precision value.
1682/// The address of the memory location does not have to be aligned.
1683/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1684static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1685 struct __mm_load_sd_struct {
1686 double __u;
1687 } __attribute__((__packed__, __may_alias__));
1688 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1689 return __extension__(__m128d){__u, 0};
1690}
1691
1692/// Loads a double-precision value into the high-order bits of a 128-bit
1693/// vector of [2 x double]. The low-order bits are copied from the low-order
1694/// bits of the first operand.
1695///
1696/// \headerfile <x86intrin.h>
1697///
1698/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1699///
1700/// \param __a
1701/// A 128-bit vector of [2 x double]. \n
1702/// Bits [63:0] are written to bits [63:0] of the result.
1703/// \param __dp
1704/// A pointer to a 64-bit memory location containing a double-precision
1705/// floating-point value that is loaded. The loaded value is written to bits
1706/// [127:64] of the result. The address of the memory location does not have
1707/// to be aligned.
1708/// \returns A 128-bit vector of [2 x double] containing the moved values.
1709static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1710 double const *__dp) {
1711 struct __mm_loadh_pd_struct {
1712 double __u;
1713 } __attribute__((__packed__, __may_alias__));
1714 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1715 return __extension__(__m128d){__a[0], __u};
1716}
1717
1718/// Loads a double-precision value into the low-order bits of a 128-bit
1719/// vector of [2 x double]. The high-order bits are copied from the
1720/// high-order bits of the first operand.
1721///
1722/// \headerfile <x86intrin.h>
1723///
1724/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1725///
1726/// \param __a
1727/// A 128-bit vector of [2 x double]. \n
1728/// Bits [127:64] are written to bits [127:64] of the result.
1729/// \param __dp
1730/// A pointer to a 64-bit memory location containing a double-precision
1731/// floating-point value that is loaded. The loaded value is written to bits
1732/// [63:0] of the result. The address of the memory location does not have to
1733/// be aligned.
1734/// \returns A 128-bit vector of [2 x double] containing the moved values.
1735static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1736 double const *__dp) {
1737 struct __mm_loadl_pd_struct {
1738 double __u;
1739 } __attribute__((__packed__, __may_alias__));
1740 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1741 return __extension__(__m128d){__u, __a[1]};
1742}
1743
1744/// Constructs a 128-bit floating-point vector of [2 x double] with
1745/// unspecified content. This could be used as an argument to another
1746/// intrinsic function where the argument is required but the value is not
1747/// actually used.
1748///
1749/// \headerfile <x86intrin.h>
1750///
1751/// This intrinsic has no corresponding instruction.
1752///
1753/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1754/// content.
1755static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1756 return (__m128d)__builtin_ia32_undef128();
1757}
1758
1759/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1760/// 64 bits of the vector are initialized with the specified double-precision
1761/// floating-point value. The upper 64 bits are set to zero.
1762///
1763/// \headerfile <x86intrin.h>
1764///
1765/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1766///
1767/// \param __w
1768/// A double-precision floating-point value used to initialize the lower 64
1769/// bits of the result.
1770/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1771/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1772/// set to zero.
1773static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1774 return __extension__(__m128d){__w, 0};
1775}
1776
1777/// Constructs a 128-bit floating-point vector of [2 x double], with each
1778/// of the two double-precision floating-point vector elements set to the
1779/// specified double-precision floating-point value.
1780///
1781/// \headerfile <x86intrin.h>
1782///
1783/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1784///
1785/// \param __w
1786/// A double-precision floating-point value used to initialize each vector
1787/// element of the result.
1788/// \returns An initialized 128-bit floating-point vector of [2 x double].
1789static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1790 return __extension__(__m128d){__w, __w};
1791}
1792
1793/// Constructs a 128-bit floating-point vector of [2 x double], with each
1794/// of the two double-precision floating-point vector elements set to the
1795/// specified double-precision floating-point value.
1796///
1797/// \headerfile <x86intrin.h>
1798///
1799/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1800///
1801/// \param __w
1802/// A double-precision floating-point value used to initialize each vector
1803/// element of the result.
1804/// \returns An initialized 128-bit floating-point vector of [2 x double].
1805static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1806 return _mm_set1_pd(__w);
1807}
1808
1809/// Constructs a 128-bit floating-point vector of [2 x double]
1810/// initialized with the specified double-precision floating-point values.
1811///
1812/// \headerfile <x86intrin.h>
1813///
1814/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1815///
1816/// \param __w
1817/// A double-precision floating-point value used to initialize the upper 64
1818/// bits of the result.
1819/// \param __x
1820/// A double-precision floating-point value used to initialize the lower 64
1821/// bits of the result.
1822/// \returns An initialized 128-bit floating-point vector of [2 x double].
1823static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1824 double __x) {
1825 return __extension__(__m128d){__x, __w};
1826}
1827
1828/// Constructs a 128-bit floating-point vector of [2 x double],
1829/// initialized in reverse order with the specified double-precision
1830/// floating-point values.
1831///
1832/// \headerfile <x86intrin.h>
1833///
1834/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1835///
1836/// \param __w
1837/// A double-precision floating-point value used to initialize the lower 64
1838/// bits of the result.
1839/// \param __x
1840/// A double-precision floating-point value used to initialize the upper 64
1841/// bits of the result.
1842/// \returns An initialized 128-bit floating-point vector of [2 x double].
1843static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1844 double __x) {
1845 return __extension__(__m128d){__w, __x};
1846}
1847
1848/// Constructs a 128-bit floating-point vector of [2 x double]
1849/// initialized to zero.
1850///
1851/// \headerfile <x86intrin.h>
1852///
1853/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1854///
1855/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1856/// all elements set to zero.
1857static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1858 return __extension__(__m128d){0.0, 0.0};
1859}
1860
1861/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1862/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1863/// 64 bits are set to the upper 64 bits of the first parameter.
1864///
1865/// \headerfile <x86intrin.h>
1866///
1867/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1868///
1869/// \param __a
1870/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1871/// upper 64 bits of the result.
1872/// \param __b
1873/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1874/// lower 64 bits of the result.
1875/// \returns A 128-bit vector of [2 x double] containing the moved values.
1876static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1877 __m128d __b) {
1878 __a[0] = __b[0];
1879 return __a;
1880}
1881
1882/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1883/// memory location.
1884///
1885/// \headerfile <x86intrin.h>
1886///
1887/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1888///
1889/// \param __dp
1890/// A pointer to a 64-bit memory location.
1891/// \param __a
1892/// A 128-bit vector of [2 x double] containing the value to be stored.
1893static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1894 __m128d __a) {
1895 struct __mm_store_sd_struct {
1896 double __u;
1897 } __attribute__((__packed__, __may_alias__));
1898 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1899}
1900
1901/// Moves packed double-precision values from a 128-bit vector of
1902/// [2 x double] to a memory location.
1903///
1904/// \headerfile <x86intrin.h>
1905///
1906/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1907///
1908/// \param __dp
1909/// A pointer to an aligned memory location that can store two
1910/// double-precision values.
1911/// \param __a
1912/// A packed 128-bit vector of [2 x double] containing the values to be
1913/// moved.
1914static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1915 __m128d __a) {
1916 *(__m128d *)__dp = __a;
1917}
1918
1919/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1920/// the upper and lower 64 bits of a memory location.
1921///
1922/// \headerfile <x86intrin.h>
1923///
1924/// This intrinsic corresponds to the
1925/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1926///
/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1930/// \param __a
1931/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1932/// of the values in \a __dp.
1933static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1934 __m128d __a) {
1935 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1936 _mm_store_pd(__dp, __a);
1937}
1938
1939/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1940/// the upper and lower 64 bits of a memory location.
1941///
1942/// \headerfile <x86intrin.h>
1943///
1944/// This intrinsic corresponds to the
1945/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1946///
/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1950/// \param __a
1951/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1952/// of the values in \a __dp.
1953static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1954 __m128d __a) {
1955 _mm_store1_pd(__dp, __a);
1956}
1957
1958/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1959/// location.
1960///
1961/// \headerfile <x86intrin.h>
1962///
1963/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1964///
1965/// \param __dp
1966/// A pointer to a 128-bit memory location. The address of the memory
1967/// location does not have to be aligned.
1968/// \param __a
1969/// A 128-bit vector of [2 x double] containing the values to be stored.
1970static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1971 __m128d __a) {
1972 struct __storeu_pd {
1973 __m128d_u __v;
1974 } __attribute__((__packed__, __may_alias__));
1975 ((struct __storeu_pd *)__dp)->__v = __a;
1976}
1977
1978/// Stores two double-precision values, in reverse order, from a 128-bit
1979/// vector of [2 x double] to a 16-byte aligned memory location.
1980///
1981/// \headerfile <x86intrin.h>
1982///
1983/// This intrinsic corresponds to a shuffling instruction followed by a
1984/// <c> VMOVAPD / MOVAPD </c> instruction.
1985///
1986/// \param __dp
1987/// A pointer to a 16-byte aligned memory location that can store two
1988/// double-precision values.
1989/// \param __a
1990/// A 128-bit vector of [2 x double] containing the values to be reversed and
1991/// stored.
1992static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1993 __m128d __a) {
1994 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1995 *(__m128d *)__dp = __a;
1996}
1997
1998/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1999/// memory location.
2000///
2001/// \headerfile <x86intrin.h>
2002///
2003/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2004///
2005/// \param __dp
2006/// A pointer to a 64-bit memory location.
2007/// \param __a
2008/// A 128-bit vector of [2 x double] containing the value to be stored.
2009static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2010 __m128d __a) {
2011 struct __mm_storeh_pd_struct {
2012 double __u;
2013 } __attribute__((__packed__, __may_alias__));
2014 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2015}
2016
2017/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2018/// memory location.
2019///
2020/// \headerfile <x86intrin.h>
2021///
2022/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2023///
2024/// \param __dp
2025/// A pointer to a 64-bit memory location.
2026/// \param __a
2027/// A 128-bit vector of [2 x double] containing the value to be stored.
2028static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2029 __m128d __a) {
2030 struct __mm_storeh_pd_struct {
2031 double __u;
2032 } __attribute__((__packed__, __may_alias__));
2033 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2034}
2035
2036/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2037/// saving the lower 8 bits of each sum in the corresponding element of a
2038/// 128-bit result vector of [16 x i8].
2039///
2040/// The integer elements of both parameters can be either signed or unsigned.
2041///
2042/// \headerfile <x86intrin.h>
2043///
2044/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2045///
2046/// \param __a
2047/// A 128-bit vector of [16 x i8].
2048/// \param __b
2049/// A 128-bit vector of [16 x i8].
2050/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2051/// parameters.
2052static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2053 __m128i __b) {
2054 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2055}
2056
2057/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2058/// saving the lower 16 bits of each sum in the corresponding element of a
2059/// 128-bit result vector of [8 x i16].
2060///
2061/// The integer elements of both parameters can be either signed or unsigned.
2062///
2063/// \headerfile <x86intrin.h>
2064///
2065/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2066///
2067/// \param __a
2068/// A 128-bit vector of [8 x i16].
2069/// \param __b
2070/// A 128-bit vector of [8 x i16].
2071/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2072/// parameters.
2073static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2074 __m128i __b) {
2075 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2076}
2077
2078/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2079/// saving the lower 32 bits of each sum in the corresponding element of a
2080/// 128-bit result vector of [4 x i32].
2081///
2082/// The integer elements of both parameters can be either signed or unsigned.
2083///
2084/// \headerfile <x86intrin.h>
2085///
2086/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2087///
2088/// \param __a
2089/// A 128-bit vector of [4 x i32].
2090/// \param __b
2091/// A 128-bit vector of [4 x i32].
2092/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2093/// parameters.
2094static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2095 __m128i __b) {
2096 return (__m128i)((__v4su)__a + (__v4su)__b);
2097}
2098
2099/// Adds two signed or unsigned 64-bit integer values, returning the
2100/// lower 64 bits of the sum.
2101///
2102/// \headerfile <x86intrin.h>
2103///
2104/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2105///
2106/// \param __a
2107/// A 64-bit integer.
2108/// \param __b
2109/// A 64-bit integer.
2110/// \returns A 64-bit integer containing the sum of both parameters.
2111static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2112 __m64 __b) {
2113 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2114}
2115
2116/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2117/// saving the lower 64 bits of each sum in the corresponding element of a
2118/// 128-bit result vector of [2 x i64].
2119///
2120/// The integer elements of both parameters can be either signed or unsigned.
2121///
2122/// \headerfile <x86intrin.h>
2123///
2124/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2125///
2126/// \param __a
2127/// A 128-bit vector of [2 x i64].
2128/// \param __b
2129/// A 128-bit vector of [2 x i64].
2130/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2131/// parameters.
2132static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2133 __m128i __b) {
2134 return (__m128i)((__v2du)__a + (__v2du)__b);
2135}
2136
2137/// Adds, with saturation, the corresponding elements of two 128-bit
2138/// signed [16 x i8] vectors, saving each sum in the corresponding element
2139/// of a 128-bit result vector of [16 x i8].
2140///
2141/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2142/// less than 0x80 are saturated to 0x80.
2143///
2144/// \headerfile <x86intrin.h>
2145///
2146/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2147///
2148/// \param __a
2149/// A 128-bit signed [16 x i8] vector.
2150/// \param __b
2151/// A 128-bit signed [16 x i8] vector.
2152/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2153/// both parameters.
2154static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2155 __m128i __b) {
2156 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2157}
2158
2159/// Adds, with saturation, the corresponding elements of two 128-bit
2160/// signed [8 x i16] vectors, saving each sum in the corresponding element
2161/// of a 128-bit result vector of [8 x i16].
2162///
2163/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2164/// less than 0x8000 are saturated to 0x8000.
2165///
2166/// \headerfile <x86intrin.h>
2167///
2168/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2169///
2170/// \param __a
2171/// A 128-bit signed [8 x i16] vector.
2172/// \param __b
2173/// A 128-bit signed [8 x i16] vector.
2174/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2175/// both parameters.
2176static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2177 __m128i __b) {
2178 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2179}
2180
2181/// Adds, with saturation, the corresponding elements of two 128-bit
2182/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2183/// of a 128-bit result vector of [16 x i8].
2184///
2185/// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2186/// saturated to 0x00.
2187///
2188/// \headerfile <x86intrin.h>
2189///
2190/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2191///
2192/// \param __a
2193/// A 128-bit unsigned [16 x i8] vector.
2194/// \param __b
2195/// A 128-bit unsigned [16 x i8] vector.
2196/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2197/// of both parameters.
2198static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2199 __m128i __b) {
2200 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2201}
2202
2203/// Adds, with saturation, the corresponding elements of two 128-bit
2204/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2205/// of a 128-bit result vector of [8 x i16].
2206///
2207/// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2208/// are saturated to 0x0000.
2209///
2210/// \headerfile <x86intrin.h>
2211///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2213///
2214/// \param __a
2215/// A 128-bit unsigned [8 x i16] vector.
2216/// \param __b
2217/// A 128-bit unsigned [8 x i16] vector.
2218/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2219/// of both parameters.
2220static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2221 __m128i __b) {
2222 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2223}
2224
2225/// Computes the rounded averages of corresponding elements of two
2226/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2227/// corresponding element of a 128-bit result vector of [16 x i8].
2228///
2229/// \headerfile <x86intrin.h>
2230///
2231/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2232///
2233/// \param __a
2234/// A 128-bit unsigned [16 x i8] vector.
2235/// \param __b
2236/// A 128-bit unsigned [16 x i8] vector.
2237/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2238/// averages of both parameters.
2239static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2240 __m128i __b) {
2241 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2242}
2243
2244/// Computes the rounded averages of corresponding elements of two
2245/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2246/// corresponding element of a 128-bit result vector of [8 x i16].
2247///
2248/// \headerfile <x86intrin.h>
2249///
2250/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2251///
2252/// \param __a
2253/// A 128-bit unsigned [8 x i16] vector.
2254/// \param __b
2255/// A 128-bit unsigned [8 x i16] vector.
2256/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2257/// averages of both parameters.
2258static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2259 __m128i __b) {
2260 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2261}
2262
2263/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2264/// vectors, producing eight intermediate 32-bit signed integer products, and
2265/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2266/// [4 x i32] vector.
2267///
2268/// For example, bits [15:0] of both parameters are multiplied producing a
2269/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2270/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2271/// of the result.
2272///
2273/// \headerfile <x86intrin.h>
2274///
2275/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2276///
2277/// \param __a
2278/// A 128-bit signed [8 x i16] vector.
2279/// \param __b
2280/// A 128-bit signed [8 x i16] vector.
2281/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2282/// of both parameters.
2283static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2284 __m128i __b) {
2285 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2286}
2287
2288/// Compares corresponding elements of two 128-bit signed [8 x i16]
2289/// vectors, saving the greater value from each comparison in the
2290/// corresponding element of a 128-bit result vector of [8 x i16].
2291///
2292/// \headerfile <x86intrin.h>
2293///
2294/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2295///
2296/// \param __a
2297/// A 128-bit signed [8 x i16] vector.
2298/// \param __b
2299/// A 128-bit signed [8 x i16] vector.
2300/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2301/// each comparison.
2302static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2303 __m128i __b) {
2304 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2305}
2306
2307/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2308/// vectors, saving the greater value from each comparison in the
2309/// corresponding element of a 128-bit result vector of [16 x i8].
2310///
2311/// \headerfile <x86intrin.h>
2312///
2313/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2314///
2315/// \param __a
2316/// A 128-bit unsigned [16 x i8] vector.
2317/// \param __b
2318/// A 128-bit unsigned [16 x i8] vector.
2319/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2320/// each comparison.
2321static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2322 __m128i __b) {
2323 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2324}
2325
2326/// Compares corresponding elements of two 128-bit signed [8 x i16]
2327/// vectors, saving the smaller value from each comparison in the
2328/// corresponding element of a 128-bit result vector of [8 x i16].
2329///
2330/// \headerfile <x86intrin.h>
2331///
2332/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2333///
2334/// \param __a
2335/// A 128-bit signed [8 x i16] vector.
2336/// \param __b
2337/// A 128-bit signed [8 x i16] vector.
2338/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2339/// each comparison.
2340static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2341 __m128i __b) {
2342 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2343}
2344
2345/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2346/// vectors, saving the smaller value from each comparison in the
2347/// corresponding element of a 128-bit result vector of [16 x i8].
2348///
2349/// \headerfile <x86intrin.h>
2350///
2351/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2352///
2353/// \param __a
2354/// A 128-bit unsigned [16 x i8] vector.
2355/// \param __b
2356/// A 128-bit unsigned [16 x i8] vector.
2357/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2358/// each comparison.
2359static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2360 __m128i __b) {
2361 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2362}
2363
2364/// Multiplies the corresponding elements of two signed [8 x i16]
2365/// vectors, saving the upper 16 bits of each 32-bit product in the
2366/// corresponding element of a 128-bit signed [8 x i16] result vector.
2367///
2368/// \headerfile <x86intrin.h>
2369///
2370/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2371///
2372/// \param __a
2373/// A 128-bit signed [8 x i16] vector.
2374/// \param __b
2375/// A 128-bit signed [8 x i16] vector.
2376/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2377/// each of the eight 32-bit products.
2378static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2379 __m128i __b) {
2380 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2381}
2382
2383/// Multiplies the corresponding elements of two unsigned [8 x i16]
2384/// vectors, saving the upper 16 bits of each 32-bit product in the
2385/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2386///
2387/// \headerfile <x86intrin.h>
2388///
2389/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2390///
2391/// \param __a
2392/// A 128-bit unsigned [8 x i16] vector.
2393/// \param __b
2394/// A 128-bit unsigned [8 x i16] vector.
2395/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2396/// of each of the eight 32-bit products.
2397static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2398 __m128i __b) {
2399 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2400}
2401
2402/// Multiplies the corresponding elements of two signed [8 x i16]
2403/// vectors, saving the lower 16 bits of each 32-bit product in the
2404/// corresponding element of a 128-bit signed [8 x i16] result vector.
2405///
2406/// \headerfile <x86intrin.h>
2407///
2408/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2409///
2410/// \param __a
2411/// A 128-bit signed [8 x i16] vector.
2412/// \param __b
2413/// A 128-bit signed [8 x i16] vector.
2414/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2415/// each of the eight 32-bit products.
2416static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2417 __m128i __b) {
2418 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2419}
2420
2421/// Multiplies 32-bit unsigned integer values contained in the lower bits
2422/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2423/// product.
2424///
2425/// \headerfile <x86intrin.h>
2426///
2427/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2428///
2429/// \param __a
2430/// A 64-bit integer containing one of the source operands.
2431/// \param __b
2432/// A 64-bit integer containing one of the source operands.
2433/// \returns A 64-bit integer vector containing the product of both operands.
2434static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2435 __m64 __b) {
2436 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2437}
2438
2439/// Multiplies 32-bit unsigned integer values contained in the lower
2440/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2441/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2442///
2443/// \headerfile <x86intrin.h>
2444///
2445/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2446///
2447/// \param __a
2448/// A [2 x i64] vector containing one of the source operands.
2449/// \param __b
2450/// A [2 x i64] vector containing one of the source operands.
2451/// \returns A [2 x i64] vector containing the product of both operands.
2452static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2453 __m128i __b) {
2454 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2455}
2456
2457/// Computes the absolute differences of corresponding 8-bit integer
2458/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2459/// separately sums the second 8 absolute differences. Packs these two
2460/// unsigned 16-bit integer sums into the upper and lower elements of a
2461/// [2 x i64] vector.
2462///
2463/// \headerfile <x86intrin.h>
2464///
2465/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2466///
2467/// \param __a
2468/// A 128-bit integer vector containing one of the source operands.
2469/// \param __b
2470/// A 128-bit integer vector containing one of the source operands.
2471/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2472/// differences between both operands.
2473static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2474 __m128i __b) {
2475 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2476}
2477
2478/// Subtracts the corresponding 8-bit integer values in the operands.
2479///
2480/// \headerfile <x86intrin.h>
2481///
2482/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2483///
2484/// \param __a
2485/// A 128-bit integer vector containing the minuends.
2486/// \param __b
2487/// A 128-bit integer vector containing the subtrahends.
2488/// \returns A 128-bit integer vector containing the differences of the values
2489/// in the operands.
2490static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2491 __m128i __b) {
2492 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2493}
2494
2495/// Subtracts the corresponding 16-bit integer values in the operands.
2496///
2497/// \headerfile <x86intrin.h>
2498///
2499/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2500///
2501/// \param __a
2502/// A 128-bit integer vector containing the minuends.
2503/// \param __b
2504/// A 128-bit integer vector containing the subtrahends.
2505/// \returns A 128-bit integer vector containing the differences of the values
2506/// in the operands.
2507static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2508 __m128i __b) {
2509 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2510}
2511
2512/// Subtracts the corresponding 32-bit integer values in the operands.
2513///
2514/// \headerfile <x86intrin.h>
2515///
2516/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2517///
2518/// \param __a
2519/// A 128-bit integer vector containing the minuends.
2520/// \param __b
2521/// A 128-bit integer vector containing the subtrahends.
2522/// \returns A 128-bit integer vector containing the differences of the values
2523/// in the operands.
2524static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2525 __m128i __b) {
2526 return (__m128i)((__v4su)__a - (__v4su)__b);
2527}
2528
2529/// Subtracts signed or unsigned 64-bit integer values and writes the
2530/// difference to the corresponding bits in the destination.
2531///
2532/// \headerfile <x86intrin.h>
2533///
2534/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2535///
2536/// \param __a
2537/// A 64-bit integer vector containing the minuend.
2538/// \param __b
2539/// A 64-bit integer vector containing the subtrahend.
2540/// \returns A 64-bit integer vector containing the difference of the values in
2541/// the operands.
2542static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2543 __m64 __b) {
2544 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2545}
2546
2547/// Subtracts the corresponding elements of two [2 x i64] vectors.
2548///
2549/// \headerfile <x86intrin.h>
2550///
2551/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2552///
2553/// \param __a
2554/// A 128-bit integer vector containing the minuends.
2555/// \param __b
2556/// A 128-bit integer vector containing the subtrahends.
2557/// \returns A 128-bit integer vector containing the differences of the values
2558/// in the operands.
2559static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2560 __m128i __b) {
2561 return (__m128i)((__v2du)__a - (__v2du)__b);
2562}
2563
2564/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2565/// the input and returns the differences in the corresponding bytes in the
2566/// destination.
2567///
2568/// Differences greater than 0x7F are saturated to 0x7F, and differences
2569/// less than 0x80 are saturated to 0x80.
2570///
2571/// \headerfile <x86intrin.h>
2572///
2573/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2574///
2575/// \param __a
2576/// A 128-bit integer vector containing the minuends.
2577/// \param __b
2578/// A 128-bit integer vector containing the subtrahends.
2579/// \returns A 128-bit integer vector containing the differences of the values
2580/// in the operands.
2581static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2582 __m128i __b) {
2583 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2584}
2585
2586/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2587/// the input and returns the differences in the corresponding bytes in the
2588/// destination.
2589///
2590/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2591/// than 0x8000 are saturated to 0x8000.
2592///
2593/// \headerfile <x86intrin.h>
2594///
2595/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2596///
2597/// \param __a
2598/// A 128-bit integer vector containing the minuends.
2599/// \param __b
2600/// A 128-bit integer vector containing the subtrahends.
2601/// \returns A 128-bit integer vector containing the differences of the values
2602/// in the operands.
2603static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2604 __m128i __b) {
2605 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2606}
2607
2608/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2609/// the input and returns the differences in the corresponding bytes in the
2610/// destination.
2611///
2612/// Differences less than 0x00 are saturated to 0x00.
2613///
2614/// \headerfile <x86intrin.h>
2615///
2616/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2617///
2618/// \param __a
2619/// A 128-bit integer vector containing the minuends.
2620/// \param __b
2621/// A 128-bit integer vector containing the subtrahends.
2622/// \returns A 128-bit integer vector containing the unsigned integer
2623/// differences of the values in the operands.
2624static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2625 __m128i __b) {
2626 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2627}
2628
2629/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2630/// the input and returns the differences in the corresponding bytes in the
2631/// destination.
2632///
2633/// Differences less than 0x0000 are saturated to 0x0000.
2634///
2635/// \headerfile <x86intrin.h>
2636///
2637/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2638///
2639/// \param __a
2640/// A 128-bit integer vector containing the minuends.
2641/// \param __b
2642/// A 128-bit integer vector containing the subtrahends.
2643/// \returns A 128-bit integer vector containing the unsigned integer
2644/// differences of the values in the operands.
2645static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2646 __m128i __b) {
2647 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2648}
2649
2650/// Performs a bitwise AND of two 128-bit integer vectors.
2651///
2652/// \headerfile <x86intrin.h>
2653///
2654/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2655///
2656/// \param __a
2657/// A 128-bit integer vector containing one of the source operands.
2658/// \param __b
2659/// A 128-bit integer vector containing one of the source operands.
2660/// \returns A 128-bit integer vector containing the bitwise AND of the values
2661/// in both operands.
2662static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2663 __m128i __b) {
2664 return (__m128i)((__v2du)__a & (__v2du)__b);
2665}
2666
2667/// Performs a bitwise AND of two 128-bit integer vectors, using the
2668/// one's complement of the values contained in the first source operand.
2669///
2670/// \headerfile <x86intrin.h>
2671///
2672/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2673///
2674/// \param __a
2675/// A 128-bit vector containing the left source operand. The one's complement
2676/// of this value is used in the bitwise AND.
2677/// \param __b
2678/// A 128-bit vector containing the right source operand.
2679/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2680/// complement of the first operand and the values in the second operand.
2681static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2682 __m128i __b) {
2683 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2684}
2685/// Performs a bitwise OR of two 128-bit integer vectors.
2686///
2687/// \headerfile <x86intrin.h>
2688///
2689/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2690///
2691/// \param __a
2692/// A 128-bit integer vector containing one of the source operands.
2693/// \param __b
2694/// A 128-bit integer vector containing one of the source operands.
2695/// \returns A 128-bit integer vector containing the bitwise OR of the values
2696/// in both operands.
2697static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2698 __m128i __b) {
2699 return (__m128i)((__v2du)__a | (__v2du)__b);
2700}
2701
2702/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2703///
2704/// \headerfile <x86intrin.h>
2705///
2706/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2707///
2708/// \param __a
2709/// A 128-bit integer vector containing one of the source operands.
2710/// \param __b
2711/// A 128-bit integer vector containing one of the source operands.
2712/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2713/// values in both operands.
2714static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2715 __m128i __b) {
2716 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2717}
2718
/// Shifts the whole 128-bit integer vector operand to the left by the given
///    number of bytes. The vacated low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2739
/// Alternate spelling of _mm_slli_si128: shifts the 128-bit integer vector
///    \a a to the left by \a imm bytes through the same builtin.
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
2743
2744/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2745/// by the specified number of bits. Low-order bits are cleared.
2746///
2747/// \headerfile <x86intrin.h>
2748///
2749/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2750///
2751/// \param __a
2752/// A 128-bit integer vector containing the source operand.
2753/// \param __count
2754/// An integer value specifying the number of bits to left-shift each value
2755/// in operand \a __a.
2756/// \returns A 128-bit integer vector containing the left-shifted values.
2757static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2758 int __count) {
2759 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2760}
2761
2762/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2763/// by the specified number of bits. Low-order bits are cleared.
2764///
2765/// \headerfile <x86intrin.h>
2766///
2767/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2768///
2769/// \param __a
2770/// A 128-bit integer vector containing the source operand.
2771/// \param __count
2772/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2773/// to left-shift each value in operand \a __a.
2774/// \returns A 128-bit integer vector containing the left-shifted values.
2775static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2776 __m128i __count) {
2777 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2778}
2779
2780/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2781/// by the specified number of bits. Low-order bits are cleared.
2782///
2783/// \headerfile <x86intrin.h>
2784///
2785/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2786///
2787/// \param __a
2788/// A 128-bit integer vector containing the source operand.
2789/// \param __count
2790/// An integer value specifying the number of bits to left-shift each value
2791/// in operand \a __a.
2792/// \returns A 128-bit integer vector containing the left-shifted values.
2793static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2794 int __count) {
2795 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2796}
2797
2798/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2799/// by the specified number of bits. Low-order bits are cleared.
2800///
2801/// \headerfile <x86intrin.h>
2802///
2803/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2804///
2805/// \param __a
2806/// A 128-bit integer vector containing the source operand.
2807/// \param __count
2808/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2809/// to left-shift each value in operand \a __a.
2810/// \returns A 128-bit integer vector containing the left-shifted values.
2811static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2812 __m128i __count) {
2813 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2814}
2815
2816/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2817/// by the specified number of bits. Low-order bits are cleared.
2818///
2819/// \headerfile <x86intrin.h>
2820///
2821/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2822///
2823/// \param __a
2824/// A 128-bit integer vector containing the source operand.
2825/// \param __count
2826/// An integer value specifying the number of bits to left-shift each value
2827/// in operand \a __a.
2828/// \returns A 128-bit integer vector containing the left-shifted values.
2829static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2830 int __count) {
2831 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2832}
2833
2834/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2835/// by the specified number of bits. Low-order bits are cleared.
2836///
2837/// \headerfile <x86intrin.h>
2838///
2839/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2840///
2841/// \param __a
2842/// A 128-bit integer vector containing the source operand.
2843/// \param __count
2844/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2845/// to left-shift each value in operand \a __a.
2846/// \returns A 128-bit integer vector containing the left-shifted values.
2847static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2848 __m128i __count) {
2849 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2850}
2851
2852/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2853/// by the specified number of bits. High-order bits are filled with the sign
2854/// bit of the initial value.
2855///
2856/// \headerfile <x86intrin.h>
2857///
2858/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2859///
2860/// \param __a
2861/// A 128-bit integer vector containing the source operand.
2862/// \param __count
2863/// An integer value specifying the number of bits to right-shift each value
2864/// in operand \a __a.
2865/// \returns A 128-bit integer vector containing the right-shifted values.
2866static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2867 int __count) {
2868 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2869}
2870
2871/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2872/// by the specified number of bits. High-order bits are filled with the sign
2873/// bit of the initial value.
2874///
2875/// \headerfile <x86intrin.h>
2876///
2877/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2878///
2879/// \param __a
2880/// A 128-bit integer vector containing the source operand.
2881/// \param __count
2882/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2883/// to right-shift each value in operand \a __a.
2884/// \returns A 128-bit integer vector containing the right-shifted values.
2885static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2886 __m128i __count) {
2887 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2888}
2889
2890/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2891/// by the specified number of bits. High-order bits are filled with the sign
2892/// bit of the initial value.
2893///
2894/// \headerfile <x86intrin.h>
2895///
2896/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2897///
2898/// \param __a
2899/// A 128-bit integer vector containing the source operand.
2900/// \param __count
2901/// An integer value specifying the number of bits to right-shift each value
2902/// in operand \a __a.
2903/// \returns A 128-bit integer vector containing the right-shifted values.
2904static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2905 int __count) {
2906 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2907}
2908
2909/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2910/// by the specified number of bits. High-order bits are filled with the sign
2911/// bit of the initial value.
2912///
2913/// \headerfile <x86intrin.h>
2914///
2915/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2916///
2917/// \param __a
2918/// A 128-bit integer vector containing the source operand.
2919/// \param __count
2920/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2921/// to right-shift each value in operand \a __a.
2922/// \returns A 128-bit integer vector containing the right-shifted values.
2923static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2924 __m128i __count) {
2925 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2926}
2927
/// Shifts the whole 128-bit integer vector operand to the right by the
///    given number of bytes. The vacated high-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted right.
/// \returns A 128-bit integer vector holding the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2948
/// Alternate spelling of _mm_srli_si128: shifts the 128-bit integer vector
///    \a a to the right by \a imm bytes through the same builtin.
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
2952
2953/// Right-shifts each of 16-bit values in the 128-bit integer vector
2954/// operand by the specified number of bits. High-order bits are cleared.
2955///
2956/// \headerfile <x86intrin.h>
2957///
2958/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2959///
2960/// \param __a
2961/// A 128-bit integer vector containing the source operand.
2962/// \param __count
2963/// An integer value specifying the number of bits to right-shift each value
2964/// in operand \a __a.
2965/// \returns A 128-bit integer vector containing the right-shifted values.
2966static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2967 int __count) {
2968 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2969}
2970
2971/// Right-shifts each of 16-bit values in the 128-bit integer vector
2972/// operand by the specified number of bits. High-order bits are cleared.
2973///
2974/// \headerfile <x86intrin.h>
2975///
2976/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2977///
2978/// \param __a
2979/// A 128-bit integer vector containing the source operand.
2980/// \param __count
2981/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2982/// to right-shift each value in operand \a __a.
2983/// \returns A 128-bit integer vector containing the right-shifted values.
2984static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2985 __m128i __count) {
2986 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2987}
2988
2989/// Right-shifts each of 32-bit values in the 128-bit integer vector
2990/// operand by the specified number of bits. High-order bits are cleared.
2991///
2992/// \headerfile <x86intrin.h>
2993///
2994/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2995///
2996/// \param __a
2997/// A 128-bit integer vector containing the source operand.
2998/// \param __count
2999/// An integer value specifying the number of bits to right-shift each value
3000/// in operand \a __a.
3001/// \returns A 128-bit integer vector containing the right-shifted values.
3002static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3003 int __count) {
3004 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3005}
3006
3007/// Right-shifts each of 32-bit values in the 128-bit integer vector
3008/// operand by the specified number of bits. High-order bits are cleared.
3009///
3010/// \headerfile <x86intrin.h>
3011///
3012/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3013///
3014/// \param __a
3015/// A 128-bit integer vector containing the source operand.
3016/// \param __count
3017/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3018/// to right-shift each value in operand \a __a.
3019/// \returns A 128-bit integer vector containing the right-shifted values.
3020static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3021 __m128i __count) {
3022 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3023}
3024
3025/// Right-shifts each of 64-bit values in the 128-bit integer vector
3026/// operand by the specified number of bits. High-order bits are cleared.
3027///
3028/// \headerfile <x86intrin.h>
3029///
3030/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3031///
3032/// \param __a
3033/// A 128-bit integer vector containing the source operand.
3034/// \param __count
3035/// An integer value specifying the number of bits to right-shift each value
3036/// in operand \a __a.
3037/// \returns A 128-bit integer vector containing the right-shifted values.
3038static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3039 int __count) {
3040 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3041}
3042
3043/// Right-shifts each of 64-bit values in the 128-bit integer vector
3044/// operand by the specified number of bits. High-order bits are cleared.
3045///
3046/// \headerfile <x86intrin.h>
3047///
3048/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3049///
3050/// \param __a
3051/// A 128-bit integer vector containing the source operand.
3052/// \param __count
3053/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3054/// to right-shift each value in operand \a __a.
3055/// \returns A 128-bit integer vector containing the right-shifted values.
3056static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3057 __m128i __count) {
3058 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3059}
3060
3061/// Compares each of the corresponding 8-bit values of the 128-bit
3062/// integer vectors for equality.
3063///
3064/// Each comparison returns 0x0 for false, 0xFF for true.
3065///
3066/// \headerfile <x86intrin.h>
3067///
3068/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3069///
3070/// \param __a
3071/// A 128-bit integer vector.
3072/// \param __b
3073/// A 128-bit integer vector.
3074/// \returns A 128-bit integer vector containing the comparison results.
3075static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3076 __m128i __b) {
3077 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3078}
3079
3080/// Compares each of the corresponding 16-bit values of the 128-bit
3081/// integer vectors for equality.
3082///
3083/// Each comparison returns 0x0 for false, 0xFFFF for true.
3084///
3085/// \headerfile <x86intrin.h>
3086///
3087/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3088///
3089/// \param __a
3090/// A 128-bit integer vector.
3091/// \param __b
3092/// A 128-bit integer vector.
3093/// \returns A 128-bit integer vector containing the comparison results.
3094static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3095 __m128i __b) {
3096 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3097}
3098
3099/// Compares each of the corresponding 32-bit values of the 128-bit
3100/// integer vectors for equality.
3101///
3102/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3103///
3104/// \headerfile <x86intrin.h>
3105///
3106/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3107///
3108/// \param __a
3109/// A 128-bit integer vector.
3110/// \param __b
3111/// A 128-bit integer vector.
3112/// \returns A 128-bit integer vector containing the comparison results.
3113static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3114 __m128i __b) {
3115 return (__m128i)((__v4si)__a == (__v4si)__b);
3116}
3117
3118/// Compares each of the corresponding signed 8-bit values of the 128-bit
3119/// integer vectors to determine if the values in the first operand are
3120/// greater than those in the second operand.
3121///
3122/// Each comparison returns 0x0 for false, 0xFF for true.
3123///
3124/// \headerfile <x86intrin.h>
3125///
3126/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3127///
3128/// \param __a
3129/// A 128-bit integer vector.
3130/// \param __b
3131/// A 128-bit integer vector.
3132/// \returns A 128-bit integer vector containing the comparison results.
3133static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3134 __m128i __b) {
3135 /* This function always performs a signed comparison, but __v16qi is a char
3136 which may be signed or unsigned, so use __v16qs. */
3137 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3138}
3139
3140/// Compares each of the corresponding signed 16-bit values of the
3141/// 128-bit integer vectors to determine if the values in the first operand
3142/// are greater than those in the second operand.
3143///
3144/// Each comparison returns 0x0 for false, 0xFFFF for true.
3145///
3146/// \headerfile <x86intrin.h>
3147///
3148/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3149///
3150/// \param __a
3151/// A 128-bit integer vector.
3152/// \param __b
3153/// A 128-bit integer vector.
3154/// \returns A 128-bit integer vector containing the comparison results.
3155static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3156 __m128i __b) {
3157 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3158}
3159
3160/// Compares each of the corresponding signed 32-bit values of the
3161/// 128-bit integer vectors to determine if the values in the first operand
3162/// are greater than those in the second operand.
3163///
3164/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3165///
3166/// \headerfile <x86intrin.h>
3167///
3168/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3169///
3170/// \param __a
3171/// A 128-bit integer vector.
3172/// \param __b
3173/// A 128-bit integer vector.
3174/// \returns A 128-bit integer vector containing the comparison results.
3175static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3176 __m128i __b) {
3177 return (__m128i)((__v4si)__a > (__v4si)__b);
3178}
3179
3180/// Compares each of the corresponding signed 8-bit values of the 128-bit
3181/// integer vectors to determine if the values in the first operand are less
3182/// than those in the second operand.
3183///
3184/// Each comparison returns 0x0 for false, 0xFF for true.
3185///
3186/// \headerfile <x86intrin.h>
3187///
3188/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3189///
3190/// \param __a
3191/// A 128-bit integer vector.
3192/// \param __b
3193/// A 128-bit integer vector.
3194/// \returns A 128-bit integer vector containing the comparison results.
3195static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3196 __m128i __b) {
3197 return _mm_cmpgt_epi8(__b, __a);
3198}
3199
3200/// Compares each of the corresponding signed 16-bit values of the
3201/// 128-bit integer vectors to determine if the values in the first operand
3202/// are less than those in the second operand.
3203///
3204/// Each comparison returns 0x0 for false, 0xFFFF for true.
3205///
3206/// \headerfile <x86intrin.h>
3207///
3208/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3209///
3210/// \param __a
3211/// A 128-bit integer vector.
3212/// \param __b
3213/// A 128-bit integer vector.
3214/// \returns A 128-bit integer vector containing the comparison results.
3215static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3216 __m128i __b) {
3217 return _mm_cmpgt_epi16(__b, __a);
3218}
3219
3220/// Compares each of the corresponding signed 32-bit values of the
3221/// 128-bit integer vectors to determine if the values in the first operand
3222/// are less than those in the second operand.
3223///
3224/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3225///
3226/// \headerfile <x86intrin.h>
3227///
3228/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3229///
3230/// \param __a
3231/// A 128-bit integer vector.
3232/// \param __b
3233/// A 128-bit integer vector.
3234/// \returns A 128-bit integer vector containing the comparison results.
3235static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3236 __m128i __b) {
3237 return _mm_cmpgt_epi32(__b, __a);
3238}
3239
3240#ifdef __x86_64__
3241/// Converts a 64-bit signed integer value from the second operand into a
3242/// double-precision value and returns it in the lower element of a [2 x
3243/// double] vector; the upper element of the returned vector is copied from
3244/// the upper element of the first operand.
3245///
3246/// \headerfile <x86intrin.h>
3247///
3248/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3249///
3250/// \param __a
3251/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3252/// copied to the upper 64 bits of the destination.
3253/// \param __b
3254/// A 64-bit signed integer operand containing the value to be converted.
3255/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3256/// converted value of the second operand. The upper 64 bits are copied from
3257/// the upper 64 bits of the first operand.
3258static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3259 long long __b) {
3260 __a[0] = __b;
3261 return __a;
3262}
3263
3264/// Converts the first (lower) element of a vector of [2 x double] into a
3265/// 64-bit signed integer value.
3266///
3267/// If the converted value does not fit in a 64-bit integer, raises a
3268/// floating-point invalid exception. If the exception is masked, returns
3269/// the most negative integer.
3270///
3271/// \headerfile <x86intrin.h>
3272///
3273/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3274///
3275/// \param __a
3276/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3277/// conversion.
3278/// \returns A 64-bit signed integer containing the converted value.
3279static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3280 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3281}
3282
3283/// Converts the first (lower) element of a vector of [2 x double] into a
3284/// 64-bit signed truncated (rounded toward zero) integer value.
3285///
3286/// If a converted value does not fit in a 64-bit integer, raises a
3287/// floating-point invalid exception. If the exception is masked, returns
3288/// the most negative integer.
3289///
3290/// \headerfile <x86intrin.h>
3291///
3292/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3293/// instruction.
3294///
3295/// \param __a
3296/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3297/// conversion.
3298/// \returns A 64-bit signed integer containing the converted value.
3299static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3300 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3301}
3302#endif
3303
3304/// Converts a vector of [4 x i32] into a vector of [4 x float].
3305///
3306/// \headerfile <x86intrin.h>
3307///
3308/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3309///
3310/// \param __a
3311/// A 128-bit integer vector.
3312/// \returns A 128-bit vector of [4 x float] containing the converted values.
3313static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3314 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3315}
3316
3317/// Converts a vector of [4 x float] into a vector of [4 x i32].
3318///
3319/// If a converted value does not fit in a 32-bit integer, raises a
3320/// floating-point invalid exception. If the exception is masked, returns
3321/// the most negative integer.
3322///
3323/// \headerfile <x86intrin.h>
3324///
3325/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3326///
3327/// \param __a
3328/// A 128-bit vector of [4 x float].
3329/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3330/// values.
3331static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3332 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3333}
3334
3335/// Converts a vector of [4 x float] into four signed truncated (rounded toward
3336/// zero) 32-bit integers, returned in a vector of [4 x i32].
3337///
3338/// If a converted value does not fit in a 32-bit integer, raises a
3339/// floating-point invalid exception. If the exception is masked, returns
3340/// the most negative integer.
3341///
3342/// \headerfile <x86intrin.h>
3343///
3344/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3345/// instruction.
3346///
3347/// \param __a
3348/// A 128-bit vector of [4 x float].
3349/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3350static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3351 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3352}
3353
3354/// Returns a vector of [4 x i32] where the lowest element is the input
3355/// operand and the remaining elements are zero.
3356///
3357/// \headerfile <x86intrin.h>
3358///
3359/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3360///
3361/// \param __a
3362/// A 32-bit signed integer operand.
3363/// \returns A 128-bit vector of [4 x i32].
3364static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3365 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3366}
3367
3368/// Returns a vector of [2 x i64] where the lower element is the input
3369/// operand and the upper element is zero.
3370///
3371/// \headerfile <x86intrin.h>
3372///
3373/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3374/// in 64-bit mode.
3375///
3376/// \param __a
3377/// A 64-bit signed integer operand containing the value to be converted.
3378/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3379static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3380 return __extension__(__m128i)(__v2di){__a, 0};
3381}
3382
3383/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3384/// 32-bit signed integer value.
3385///
3386/// \headerfile <x86intrin.h>
3387///
3388/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3389///
3390/// \param __a
3391/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3392/// destination.
3393/// \returns A 32-bit signed integer containing the moved value.
3394static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3395 __v4si __b = (__v4si)__a;
3396 return __b[0];
3397}
3398
3399/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3400/// 64-bit signed integer value.
3401///
3402/// \headerfile <x86intrin.h>
3403///
3404/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3405///
3406/// \param __a
3407/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3408/// destination.
3409/// \returns A 64-bit signed integer containing the moved value.
3410static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3411 return __a[0];
3412}
3413
3414/// Moves packed integer values from an aligned 128-bit memory location
3415/// to elements in a 128-bit integer vector.
3416///
3417/// \headerfile <x86intrin.h>
3418///
3419/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3420///
3421/// \param __p
3422/// An aligned pointer to a memory location containing integer values.
3423/// \returns A 128-bit integer vector containing the moved values.
3424static __inline__ __m128i __DEFAULT_FN_ATTRS
3425_mm_load_si128(__m128i const *__p) {
3426 return *__p;
3427}
3428
3429/// Moves packed integer values from an unaligned 128-bit memory location
3430/// to elements in a 128-bit integer vector.
3431///
3432/// \headerfile <x86intrin.h>
3433///
3434/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3435///
3436/// \param __p
3437/// A pointer to a memory location containing integer values.
3438/// \returns A 128-bit integer vector containing the moved values.
3439static __inline__ __m128i __DEFAULT_FN_ATTRS
3440_mm_loadu_si128(__m128i_u const *__p) {
3441 struct __loadu_si128 {
3442 __m128i_u __v;
3443 } __attribute__((__packed__, __may_alias__));
3444 return ((const struct __loadu_si128 *)__p)->__v;
3445}
3446
3447/// Returns a vector of [2 x i64] where the lower element is taken from
3448/// the lower element of the operand, and the upper element is zero.
3449///
3450/// \headerfile <x86intrin.h>
3451///
3452/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3453///
3454/// \param __p
3455/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3456/// the destination.
3457/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3458/// moved value. The higher order bits are cleared.
3459static __inline__ __m128i __DEFAULT_FN_ATTRS
3460_mm_loadl_epi64(__m128i_u const *__p) {
3461 struct __mm_loadl_epi64_struct {
3462 long long __u;
3463 } __attribute__((__packed__, __may_alias__));
3464 return __extension__(__m128i){
3465 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3466}
3467
3468/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3469/// This could be used as an argument to another intrinsic function where the
3470/// argument is required but the value is not actually used.
3471///
3472/// \headerfile <x86intrin.h>
3473///
3474/// This intrinsic has no corresponding instruction.
3475///
3476/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3477static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3478 return (__m128i)__builtin_ia32_undef128();
3479}
3480
3481/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3482/// the specified 64-bit integer values.
3483///
3484/// \headerfile <x86intrin.h>
3485///
3486/// This intrinsic is a utility function and does not correspond to a specific
3487/// instruction.
3488///
3489/// \param __q1
3490/// A 64-bit integer value used to initialize the upper 64 bits of the
3491/// destination vector of [2 x i64].
3492/// \param __q0
3493/// A 64-bit integer value used to initialize the lower 64 bits of the
3494/// destination vector of [2 x i64].
3495/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3496/// provided in the operands.
3497static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3498 long long __q0) {
3499 return __extension__(__m128i)(__v2di){__q0, __q1};
3500}
3501
3502/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3503/// the specified 64-bit integer values.
3504///
3505/// \headerfile <x86intrin.h>
3506///
3507/// This intrinsic is a utility function and does not correspond to a specific
3508/// instruction.
3509///
3510/// \param __q1
3511/// A 64-bit integer value used to initialize the upper 64 bits of the
3512/// destination vector of [2 x i64].
3513/// \param __q0
3514/// A 64-bit integer value used to initialize the lower 64 bits of the
3515/// destination vector of [2 x i64].
3516/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3517/// provided in the operands.
3518static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3519 __m64 __q0) {
3520 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3521}
3522
3523/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3524/// the specified 32-bit integer values.
3525///
3526/// \headerfile <x86intrin.h>
3527///
3528/// This intrinsic is a utility function and does not correspond to a specific
3529/// instruction.
3530///
3531/// \param __i3
3532/// A 32-bit integer value used to initialize bits [127:96] of the
3533/// destination vector.
3534/// \param __i2
3535/// A 32-bit integer value used to initialize bits [95:64] of the destination
3536/// vector.
3537/// \param __i1
3538/// A 32-bit integer value used to initialize bits [63:32] of the destination
3539/// vector.
3540/// \param __i0
3541/// A 32-bit integer value used to initialize bits [31:0] of the destination
3542/// vector.
3543/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3544/// provided in the operands.
3545static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3546 int __i1, int __i0) {
3547 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3548}
3549
3550/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3551/// the specified 16-bit integer values.
3552///
3553/// \headerfile <x86intrin.h>
3554///
3555/// This intrinsic is a utility function and does not correspond to a specific
3556/// instruction.
3557///
3558/// \param __w7
3559/// A 16-bit integer value used to initialize bits [127:112] of the
3560/// destination vector.
3561/// \param __w6
3562/// A 16-bit integer value used to initialize bits [111:96] of the
3563/// destination vector.
3564/// \param __w5
3565/// A 16-bit integer value used to initialize bits [95:80] of the destination
3566/// vector.
3567/// \param __w4
3568/// A 16-bit integer value used to initialize bits [79:64] of the destination
3569/// vector.
3570/// \param __w3
3571/// A 16-bit integer value used to initialize bits [63:48] of the destination
3572/// vector.
3573/// \param __w2
3574/// A 16-bit integer value used to initialize bits [47:32] of the destination
3575/// vector.
3576/// \param __w1
3577/// A 16-bit integer value used to initialize bits [31:16] of the destination
3578/// vector.
3579/// \param __w0
3580/// A 16-bit integer value used to initialize bits [15:0] of the destination
3581/// vector.
3582/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3583/// provided in the operands.
3584static __inline__ __m128i __DEFAULT_FN_ATTRS
3585_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3586 short __w2, short __w1, short __w0) {
3587 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3588 __w4, __w5, __w6, __w7};
3589}
3590
3591/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3592/// the specified 8-bit integer values.
3593///
3594/// \headerfile <x86intrin.h>
3595///
3596/// This intrinsic is a utility function and does not correspond to a specific
3597/// instruction.
3598///
3599/// \param __b15
3600/// Initializes bits [127:120] of the destination vector.
3601/// \param __b14
3602/// Initializes bits [119:112] of the destination vector.
3603/// \param __b13
3604/// Initializes bits [111:104] of the destination vector.
3605/// \param __b12
3606/// Initializes bits [103:96] of the destination vector.
3607/// \param __b11
3608/// Initializes bits [95:88] of the destination vector.
3609/// \param __b10
3610/// Initializes bits [87:80] of the destination vector.
3611/// \param __b9
3612/// Initializes bits [79:72] of the destination vector.
3613/// \param __b8
3614/// Initializes bits [71:64] of the destination vector.
3615/// \param __b7
3616/// Initializes bits [63:56] of the destination vector.
3617/// \param __b6
3618/// Initializes bits [55:48] of the destination vector.
3619/// \param __b5
3620/// Initializes bits [47:40] of the destination vector.
3621/// \param __b4
3622/// Initializes bits [39:32] of the destination vector.
3623/// \param __b3
3624/// Initializes bits [31:24] of the destination vector.
3625/// \param __b2
3626/// Initializes bits [23:16] of the destination vector.
3627/// \param __b1
3628/// Initializes bits [15:8] of the destination vector.
3629/// \param __b0
3630/// Initializes bits [7:0] of the destination vector.
3631/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3632/// provided in the operands.
3633static __inline__ __m128i __DEFAULT_FN_ATTRS
3634_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3635 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3636 char __b4, char __b3, char __b2, char __b1, char __b0) {
3637 return __extension__(__m128i)(__v16qi){
3638 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3639 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3640}
3641
3642/// Initializes both values in a 128-bit integer vector with the
3643/// specified 64-bit integer value.
3644///
3645/// \headerfile <x86intrin.h>
3646///
3647/// This intrinsic is a utility function and does not correspond to a specific
3648/// instruction.
3649///
3650/// \param __q
3651/// Integer value used to initialize the elements of the destination integer
3652/// vector.
3653/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3654/// elements containing the value provided in the operand.
3655static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3656 return _mm_set_epi64x(__q, __q);
3657}
3658
3659/// Initializes both values in a 128-bit vector of [2 x i64] with the
3660/// specified 64-bit value.
3661///
3662/// \headerfile <x86intrin.h>
3663///
3664/// This intrinsic is a utility function and does not correspond to a specific
3665/// instruction.
3666///
3667/// \param __q
3668/// A 64-bit value used to initialize the elements of the destination integer
3669/// vector.
3670/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3671/// containing the value provided in the operand.
3672static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3673 return _mm_set_epi64(__q, __q);
3674}
3675
3676/// Initializes all values in a 128-bit vector of [4 x i32] with the
3677/// specified 32-bit value.
3678///
3679/// \headerfile <x86intrin.h>
3680///
3681/// This intrinsic is a utility function and does not correspond to a specific
3682/// instruction.
3683///
3684/// \param __i
3685/// A 32-bit value used to initialize the elements of the destination integer
3686/// vector.
3687/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3688/// containing the value provided in the operand.
3689static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3690 return _mm_set_epi32(__i, __i, __i, __i);
3691}
3692
3693/// Initializes all values in a 128-bit vector of [8 x i16] with the
3694/// specified 16-bit value.
3695///
3696/// \headerfile <x86intrin.h>
3697///
3698/// This intrinsic is a utility function and does not correspond to a specific
3699/// instruction.
3700///
3701/// \param __w
3702/// A 16-bit value used to initialize the elements of the destination integer
3703/// vector.
3704/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3705/// containing the value provided in the operand.
3706static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3707 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3708}
3709
3710/// Initializes all values in a 128-bit vector of [16 x i8] with the
3711/// specified 8-bit value.
3712///
3713/// \headerfile <x86intrin.h>
3714///
3715/// This intrinsic is a utility function and does not correspond to a specific
3716/// instruction.
3717///
3718/// \param __b
3719/// An 8-bit value used to initialize the elements of the destination integer
3720/// vector.
3721/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3722/// containing the value provided in the operand.
3723static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3724 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3725 __b, __b, __b, __b, __b);
3726}
3727
3728/// Constructs a 128-bit integer vector, initialized in reverse order
3729/// with the specified 64-bit integral values.
3730///
3731/// \headerfile <x86intrin.h>
3732///
3733/// This intrinsic does not correspond to a specific instruction.
3734///
3735/// \param __q0
3736/// A 64-bit integral value used to initialize the lower 64 bits of the
3737/// result.
3738/// \param __q1
3739/// A 64-bit integral value used to initialize the upper 64 bits of the
3740/// result.
3741/// \returns An initialized 128-bit integer vector.
3742static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3743 __m64 __q1) {
3744 return _mm_set_epi64(__q1, __q0);
3745}
3746
3747/// Constructs a 128-bit integer vector, initialized in reverse order
3748/// with the specified 32-bit integral values.
3749///
3750/// \headerfile <x86intrin.h>
3751///
3752/// This intrinsic is a utility function and does not correspond to a specific
3753/// instruction.
3754///
3755/// \param __i0
3756/// A 32-bit integral value used to initialize bits [31:0] of the result.
3757/// \param __i1
3758/// A 32-bit integral value used to initialize bits [63:32] of the result.
3759/// \param __i2
3760/// A 32-bit integral value used to initialize bits [95:64] of the result.
3761/// \param __i3
3762/// A 32-bit integral value used to initialize bits [127:96] of the result.
3763/// \returns An initialized 128-bit integer vector.
3764static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3765 int __i2,
3766 int __i3) {
3767 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3768}
3769
3770/// Constructs a 128-bit integer vector, initialized in reverse order
3771/// with the specified 16-bit integral values.
3772///
3773/// \headerfile <x86intrin.h>
3774///
3775/// This intrinsic is a utility function and does not correspond to a specific
3776/// instruction.
3777///
3778/// \param __w0
3779/// A 16-bit integral value used to initialize bits [15:0] of the result.
3780/// \param __w1
3781/// A 16-bit integral value used to initialize bits [31:16] of the result.
3782/// \param __w2
3783/// A 16-bit integral value used to initialize bits [47:32] of the result.
3784/// \param __w3
3785/// A 16-bit integral value used to initialize bits [63:48] of the result.
3786/// \param __w4
3787/// A 16-bit integral value used to initialize bits [79:64] of the result.
3788/// \param __w5
3789/// A 16-bit integral value used to initialize bits [95:80] of the result.
3790/// \param __w6
3791/// A 16-bit integral value used to initialize bits [111:96] of the result.
3792/// \param __w7
3793/// A 16-bit integral value used to initialize bits [127:112] of the result.
3794/// \returns An initialized 128-bit integer vector.
3795static __inline__ __m128i __DEFAULT_FN_ATTRS
3796_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3797 short __w5, short __w6, short __w7) {
3798 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3799}
3800
3801/// Constructs a 128-bit integer vector, initialized in reverse order
3802/// with the specified 8-bit integral values.
3803///
3804/// \headerfile <x86intrin.h>
3805///
3806/// This intrinsic is a utility function and does not correspond to a specific
3807/// instruction.
3808///
3809/// \param __b0
3810/// An 8-bit integral value used to initialize bits [7:0] of the result.
3811/// \param __b1
3812/// An 8-bit integral value used to initialize bits [15:8] of the result.
3813/// \param __b2
3814/// An 8-bit integral value used to initialize bits [23:16] of the result.
3815/// \param __b3
3816/// An 8-bit integral value used to initialize bits [31:24] of the result.
3817/// \param __b4
3818/// An 8-bit integral value used to initialize bits [39:32] of the result.
3819/// \param __b5
3820/// An 8-bit integral value used to initialize bits [47:40] of the result.
3821/// \param __b6
3822/// An 8-bit integral value used to initialize bits [55:48] of the result.
3823/// \param __b7
3824/// An 8-bit integral value used to initialize bits [63:56] of the result.
3825/// \param __b8
3826/// An 8-bit integral value used to initialize bits [71:64] of the result.
3827/// \param __b9
3828/// An 8-bit integral value used to initialize bits [79:72] of the result.
3829/// \param __b10
3830/// An 8-bit integral value used to initialize bits [87:80] of the result.
3831/// \param __b11
3832/// An 8-bit integral value used to initialize bits [95:88] of the result.
3833/// \param __b12
3834/// An 8-bit integral value used to initialize bits [103:96] of the result.
3835/// \param __b13
3836/// An 8-bit integral value used to initialize bits [111:104] of the result.
3837/// \param __b14
3838/// An 8-bit integral value used to initialize bits [119:112] of the result.
3839/// \param __b15
3840/// An 8-bit integral value used to initialize bits [127:120] of the result.
3841/// \returns An initialized 128-bit integer vector; \a __b0 is the lowest byte.
3842static __inline__ __m128i __DEFAULT_FN_ATTRS
3843_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3844 char __b6, char __b7, char __b8, char __b9, char __b10,
3845 char __b11, char __b12, char __b13, char __b14, char __b15) {
3846 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3847 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); // argument order reversed
3848}
3849
3850/// Creates a 128-bit integer vector with all bits set to zero.
3851///
3852/// \headerfile <x86intrin.h>
3853///
3854/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3855///
3856/// \returns A 128-bit integer vector in which both 64-bit elements (and
3857/// therefore all 128 bits) are zero.
3858static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3859 return __extension__(__m128i)(__v2di){0LL, 0LL};
3860}
3861
3862/// Stores a 128-bit integer vector to a memory location aligned on a
3863/// 128-bit (16-byte) boundary.
3864///
3865/// \headerfile <x86intrin.h>
3866///
3867/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3868///
3869/// \param __p
3870/// A pointer to a 16-byte aligned memory location that will receive the
3871/// integer values.
3872/// \param __b
3873/// A 128-bit integer vector containing the values to be stored.
3874static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3875 __m128i __b) {
3876 *__p = __b;
3877}
3878
3879/// Stores a 128-bit integer vector to an unaligned memory location.
3880///
3881/// \headerfile <x86intrin.h>
3882///
3883/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3884///
3885/// \param __p
3886/// A pointer to the 128-bit destination. The address need not be aligned.
3887/// \param __b
3888/// A 128-bit integer vector containing the values to be stored.
3889static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3890 __m128i __b) {
3891 struct __storeu_si128 {
3892 __m128i_u __v;
3893 } __attribute__((__packed__, __may_alias__)); // no alignment assumed; may alias any type
3894 ((struct __storeu_si128 *)__p)->__v = __b;
3895}
3896
3897/// Stores a 64-bit integer value from the low element of a 128-bit integer
3898/// vector.
3899///
3900/// \headerfile <x86intrin.h>
3901///
3902/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3903///
3904/// \param __p
3905/// A pointer to a 64-bit memory location. The address of the memory
3906/// location does not have to be aligned.
3907/// \param __b
3908/// A 128-bit integer vector. Bits [63:0] (element 0 of [2 x i64]) are stored.
3909static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3910 __m128i __b) {
3911 struct __storeu_si64 {
3912 long long __v;
3913 } __attribute__((__packed__, __may_alias__)); // no alignment assumed; may alias any type
3914 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3915}
3916
3917/// Stores a 32-bit integer value from the low element of a 128-bit integer
3918/// vector.
3919///
3920/// \headerfile <x86intrin.h>
3921///
3922/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3923///
3924/// \param __p
3925/// A pointer to a 32-bit memory location. The address of the memory
3926/// location does not have to be aligned.
3927/// \param __b
3928/// A 128-bit integer vector. Bits [31:0] (element 0 of [4 x i32]) are stored.
3929static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3930 __m128i __b) {
3931 struct __storeu_si32 {
3932 int __v;
3933 } __attribute__((__packed__, __may_alias__)); // no alignment assumed; may alias any type
3934 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3935}
3936
3937/// Stores a 16-bit integer value from the low element of a 128-bit integer
3938/// vector.
3939///
3940/// \headerfile <x86intrin.h>
3941///
3942/// This intrinsic does not correspond to a specific instruction.
3943///
3944/// \param __p
3945/// A pointer to a 16-bit memory location. The address of the memory
3946/// location does not have to be aligned.
3947/// \param __b
3948/// A 128-bit integer vector. Bits [15:0] (element 0 of [8 x i16]) are stored.
3949static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3950 __m128i __b) {
3951 struct __storeu_si16 {
3952 short __v;
3953 } __attribute__((__packed__, __may_alias__)); // no alignment assumed; may alias any type
3954 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3955}
3956
3957/// Moves bytes selected by the mask from the first operand to the
3958/// specified unaligned memory location. When a mask bit is 1, the
3959/// corresponding byte is written, otherwise it is not written.
3960///
3961/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3962/// used again soon). Exception and trap behavior for elements not selected
3963/// for storage to memory is implementation dependent.
3964///
3965/// \headerfile <x86intrin.h>
3966///
3967/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3968/// instruction.
3969///
3970/// \param __d
3971/// A 128-bit integer vector containing the values to be moved.
3972/// \param __n
3973/// A 128-bit integer vector containing the mask. The most significant bit of
3974/// each byte is the mask bit for the corresponding byte of \a __d.
3975/// \param __p
3976/// A pointer to an unaligned 128-bit memory location where the specified
3977/// values are moved.
3978static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3979 __m128i __n,
3980 char *__p) {
3981 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3982}
3983
3984/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3985/// a memory location.
3986///
3987/// \headerfile <x86intrin.h>
3988///
3989/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3990///
3991/// \param __p
3992/// A pointer to a 64-bit memory location that receives the lower 64 bits of
3993/// \a __a. The address of the memory location does not have to be aligned.
3994/// \param __a
3995/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3996/// value to be stored.
3997static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3998 __m128i __a) {
3999 struct __mm_storel_epi64_struct {
4000 long long __u;
4001 } __attribute__((__packed__, __may_alias__)); // no alignment assumed; may alias any type
4002 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4003}
4004
4005/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4006/// aligned memory location.
4007///
4008/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4009/// used again soon).
4010///
4011/// \headerfile <x86intrin.h>
4012///
4013/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4014///
4015/// \param __p
4016/// A pointer to the 128-bit aligned memory location used to store the value.
4017/// \param __a
4018/// A 128-bit vector of [2 x double] containing the values to be stored.
4019static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4020 __m128d __a) {
4021 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4022}
4023
4024/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4025///
4026/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4027/// used again soon).
4028///
4029/// \headerfile <x86intrin.h>
4030///
4031/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4032///
4033/// \param __p
4034/// A pointer to the 128-bit (16-byte) aligned memory location to store to.
4035/// \param __a
4036/// A 128-bit integer vector containing the values to be stored.
4037static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4038 __m128i __a) {
4039 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4040}
4041
4042/// Stores a 32-bit integer value in the specified memory location.
4043///
4044/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4045/// used again soon).
4046///
4047/// \headerfile <x86intrin.h>
4048///
4049/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4050///
4051/// \param __p
4052/// A pointer to the 32-bit memory location that receives the value.
4053/// \param __a
4054/// The 32-bit integer value to be stored.
4055static __inline__ void
4056 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4057 _mm_stream_si32(void *__p, int __a) {
4058 __builtin_ia32_movnti((int *)__p, __a);
4059}
4060
4061#ifdef __x86_64__
4062/// Stores a 64-bit integer value in the specified memory location.
4063///
4064/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4065/// used again soon). This intrinsic is only defined when targeting x86-64.
4066///
4067/// \headerfile <x86intrin.h>
4068///
4069/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4070///
4071/// \param __p
4072/// A pointer to the 64-bit memory location that receives the value.
4073/// \param __a
4074/// The 64-bit integer value to be stored.
4075static __inline__ void
4076 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4077 _mm_stream_si64(void *__p, long long __a) {
4078 __builtin_ia32_movnti64((long long *)__p, __a);
4079}
4080#endif /* __x86_64__ */
4081
4082#if defined(__cplusplus)
4083extern "C" { // the declarations below get C linkage when compiled as C++
4084#endif
4085
4086/// Flushes and invalidates, from all caches in the coherency domain, the
4087/// cache line containing \a __p.
4088///
4089/// \headerfile <x86intrin.h>
4090///
4091/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4092///
4093/// \param __p
4094/// A pointer to the memory location used to identify the cache line to be
4095/// flushed.
4096void _mm_clflush(void const *__p);
4097
4098/// Forces strong memory ordering (serialization) between load instructions
4099/// preceding this instruction and load instructions following this
4100/// instruction, ensuring that the system completes all previous loads
4101/// before executing subsequent loads.
4102///
4103/// \headerfile <x86intrin.h>
4104///
4105/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4106///
4107void _mm_lfence(void);
4108
4109/// Forces strong memory ordering (serialization) between load and store
4110/// instructions preceding this instruction and load and store instructions
4111/// following this instruction, ensuring that the system completes all
4112/// previous loads and stores before executing subsequent loads and stores.
4113///
4114/// \headerfile <x86intrin.h>
4115///
4116/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4117///
4118void _mm_mfence(void);
4119
4120#if defined(__cplusplus)
4121} // extern "C"
4122#endif
4123
4124/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4125/// vector operands into 8-bit signed integers, and packs the results into
4126/// the destination.
4127///
4128/// Values greater than 127 (0x7F) are saturated to 0x7F. Values less than
4129/// -128 are saturated to 0x80 (-128 as an 8-bit signed value).
4130///
4131/// \headerfile <x86intrin.h>
4132///
4133/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4134///
4135/// \param __a
4136/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4137/// written to the lower 64 bits of the result.
4138/// \param __b
4139/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4140/// written to the higher 64 bits of the result.
4141/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4142static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4143 __m128i __b) {
4144 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4145}
4146
4147/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4148/// vector operands into 16-bit signed integers, and packs the results into
4149/// the destination.
4150///
4151/// Values greater than 32767 (0x7FFF) are saturated to 0x7FFF. Values less
4152/// than -32768 are saturated to 0x8000 (-32768 as a 16-bit signed value).
4153///
4154/// \headerfile <x86intrin.h>
4155///
4156/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4157///
4158/// \param __a
4159/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4160/// are written to the lower 64 bits of the result.
4161/// \param __b
4162/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4163/// are written to the higher 64 bits of the result.
4164/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4165static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4166 __m128i __b) {
4167 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4168}
4169
4170/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4171/// vector operands into 8-bit unsigned integers, and packs the results into
4172/// the destination.
4173///
4174/// Values greater than 0xFF (255) are saturated to 0xFF. Negative values
4175/// are saturated to 0x00.
4176///
4177/// \headerfile <x86intrin.h>
4178///
4179/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4180///
4181/// \param __a
4182/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4183/// written to the lower 64 bits of the result.
4184/// \param __b
4185/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4186/// written to the higher 64 bits of the result.
4187/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4188static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4189 __m128i __b) {
4190 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4191}
4192
4193/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4194/// the immediate-value parameter as a selector.
4195///
4196/// \headerfile <x86intrin.h>
4197///
4198/// \code
4199/// int _mm_extract_epi16(__m128i a, const int imm);
4200/// \endcode
4201///
4202/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4203///
4204/// \param a
4205/// A 128-bit integer vector of [8 x i16].
4206/// \param imm
4207/// An immediate value. Bits [2:0] select the value from \a a to be assigned
4208/// to bits [15:0] of the result. \n
4209/// 000: assign values from bits [15:0] of \a a. \n
4210/// 001: assign values from bits [31:16] of \a a. \n
4211/// 010: assign values from bits [47:32] of \a a. \n
4212/// 011: assign values from bits [63:48] of \a a. \n
4213/// 100: assign values from bits [79:64] of \a a. \n
4214/// 101: assign values from bits [95:80] of \a a. \n
4215/// 110: assign values from bits [111:96] of \a a. \n
4216/// 111: assign values from bits [127:112] of \a a.
4217/// \returns An integer whose lower 16 bits are selected from the 128-bit
4218/// integer vector parameter and whose remaining bits are zero.
4219#define _mm_extract_epi16(a, imm) \
4220 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4221 (int)(imm)))
4222
4223/// Constructs a 128-bit integer vector by first making a copy of the
4224/// 128-bit integer vector parameter, and then inserting the lower 16 bits
4225/// of an integer parameter into the 16-bit element selected by the
4226/// immediate-value parameter.
4227///
4228/// \headerfile <x86intrin.h>
4229///
4230/// \code
4231/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4232/// \endcode
4233///
4234/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4235///
4236/// \param a
4237/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4238/// result and then one of the eight elements in the result is replaced by
4239/// the lower 16 bits of \a b.
4240/// \param b
4241/// An integer. The lower 16 bits of this parameter are written to the
4242/// element of the result selected by \a imm.
4243/// \param imm
4244/// An immediate value specifying which of the eight 16-bit elements of the
4245/// result (index 0-7) is replaced by the lower 16 bits of \a b.
4246/// \returns A 128-bit integer vector containing the constructed values.
4247#define _mm_insert_epi16(a, b, imm) \
4248 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4249 (int)(imm)))
4250
4251/// Copies the values of the most significant bits from each 8-bit
4252/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4253/// value, zero-extends the value, and writes it to the destination.
4254///
4255/// \headerfile <x86intrin.h>
4256///
4257/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4258///
4259/// \param __a
4260/// A 128-bit integer vector of [16 x i8] supplying the bits to be extracted.
4261/// \returns The most significant bits from each 8-bit element in \a __a,
4262/// written to bits [15:0]. The other bits are assigned zeros.
4263static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4264 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4265}
4266
4267/// Constructs a 128-bit integer vector by shuffling four 32-bit
4268/// elements of a 128-bit integer vector parameter, using the immediate-value
4269/// parameter as a specifier.
4270///
4271/// \headerfile <x86intrin.h>
4272///
4273/// \code
4274/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4275/// \endcode
4276///
4277/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4278///
4279/// \param a
4280/// A 128-bit integer vector containing the values to be copied.
4281/// \param imm
4282/// An immediate value containing an 8-bit value specifying which elements to
4283/// copy from \a a. The 32-bit fields of the 128-bit result are assigned
4284/// values as follows: \n
4285/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4286/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4287/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4288/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4289/// Bit value assignments: \n
4290/// 00: assign values from bits [31:0] of \a a. \n
4291/// 01: assign values from bits [63:32] of \a a. \n
4292/// 10: assign values from bits [95:64] of \a a. \n
4293/// 11: assign values from bits [127:96] of \a a. \n
4294/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4295/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4296/// <c>[b6, b4, b2, b0]</c>.
4297/// \returns A 128-bit integer vector containing the shuffled values.
4298#define _mm_shuffle_epi32(a, imm) \
4299 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4300
4301/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4302/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4303/// value parameter as a specifier.
4304///
4305/// \headerfile <x86intrin.h>
4306///
4307/// \code
4308/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4309/// \endcode
4310///
4311/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4312///
4313/// \param a
4314/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4315/// [127:64] of the result.
4316/// \param imm
4317/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4318/// Bits [1:0] are used to assign values to bits [15:0] of the result. \n
4319/// Bits [3:2] are used to assign values to bits [31:16] of the result. \n
4320/// Bits [5:4] are used to assign values to bits [47:32] of the result. \n
4321/// Bits [7:6] are used to assign values to bits [63:48] of the result. \n
4322/// Bit value assignments: \n
4323/// 00: assign values from bits [15:0] of \a a. \n
4324/// 01: assign values from bits [31:16] of \a a. \n
4325/// 10: assign values from bits [47:32] of \a a. \n
4326/// 11: assign values from bits [63:48] of \a a. \n
4327/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4328/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4329/// <c>[b6, b4, b2, b0]</c>.
4330/// \returns A 128-bit integer vector containing the shuffled values.
4331#define _mm_shufflelo_epi16(a, imm) \
4332 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4333
4334/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4335/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4336/// value parameter as a specifier.
4337///
4338/// \headerfile <x86intrin.h>
4339///
4340/// \code
4341/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4342/// \endcode
4343///
4344/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4345///
4346/// \param a
4347/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4348/// [63:0] of the result.
4349/// \param imm
4350/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4351/// Bits [1:0] are used to assign values to bits [79:64] of the result. \n
4352/// Bits [3:2] are used to assign values to bits [95:80] of the result. \n
4353/// Bits [5:4] are used to assign values to bits [111:96] of the result. \n
4354/// Bits [7:6] are used to assign values to bits [127:112] of the result. \n
4355/// Bit value assignments: \n
4356/// 00: assign values from bits [79:64] of \a a. \n
4357/// 01: assign values from bits [95:80] of \a a. \n
4358/// 10: assign values from bits [111:96] of \a a. \n
4359/// 11: assign values from bits [127:112] of \a a. \n
4360/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4361/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4362/// <c>[b6, b4, b2, b0]</c>.
4363/// \returns A 128-bit integer vector containing the shuffled values.
4364#define _mm_shufflehi_epi16(a, imm) \
4365 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4366
4367/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4368/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4369///
4370/// \headerfile <x86intrin.h>
4371///
4372/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4373/// instruction.
4374///
4375/// \param __a
4376/// A 128-bit vector of [16 x i8]. \n
4377/// Bits [71:64] are written to bits [7:0] of the result. \n
4378/// Bits [79:72] are written to bits [23:16] of the result. \n
4379/// Bits [87:80] are written to bits [39:32] of the result. \n
4380/// Bits [95:88] are written to bits [55:48] of the result. \n
4381/// Bits [103:96] are written to bits [71:64] of the result. \n
4382/// Bits [111:104] are written to bits [87:80] of the result. \n
4383/// Bits [119:112] are written to bits [103:96] of the result. \n
4384/// Bits [127:120] are written to bits [119:112] of the result.
4385/// \param __b
4386/// A 128-bit vector of [16 x i8]. \n
4387/// Bits [71:64] are written to bits [15:8] of the result. \n
4388/// Bits [79:72] are written to bits [31:24] of the result. \n
4389/// Bits [87:80] are written to bits [47:40] of the result. \n
4390/// Bits [95:88] are written to bits [63:56] of the result. \n
4391/// Bits [103:96] are written to bits [79:72] of the result. \n
4392/// Bits [111:104] are written to bits [95:88] of the result. \n
4393/// Bits [119:112] are written to bits [111:104] of the result. \n
4394/// Bits [127:120] are written to bits [127:120] of the result.
4395/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4396static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4397 __m128i __b) {
4398 return (__m128i)__builtin_shufflevector(
4399 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4400 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4401}
4402
4403/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4404/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4405///
4406/// \headerfile <x86intrin.h>
4407///
4408/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4409/// instruction.
4410///
4411/// \param __a
4412/// A 128-bit vector of [8 x i16]. \n
4413/// Bits [79:64] are written to bits [15:0] of the result. \n
4414/// Bits [95:80] are written to bits [47:32] of the result. \n
4415/// Bits [111:96] are written to bits [79:64] of the result. \n
4416/// Bits [127:112] are written to bits [111:96] of the result.
4417/// \param __b
4418/// A 128-bit vector of [8 x i16]. \n
4419/// Bits [79:64] are written to bits [31:16] of the result. \n
4420/// Bits [95:80] are written to bits [63:48] of the result. \n
4421/// Bits [111:96] are written to bits [95:80] of the result. \n
4422/// Bits [127:112] are written to bits [127:112] of the result.
4423/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4424static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4425 __m128i __b) {
4426 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4427 8 + 5, 6, 8 + 6, 7, 8 + 7);
4428}
4429
4430/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4431/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4432///
4433/// \headerfile <x86intrin.h>
4434///
4435/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4436/// instruction.
4437///
4438/// \param __a
4439/// A 128-bit vector of [4 x i32]. \n
4440/// Bits [95:64] are written to bits [31:0] of the destination. \n
4441/// Bits [127:96] are written to bits [95:64] of the destination.
4442/// \param __b
4443/// A 128-bit vector of [4 x i32]. \n
4444/// Bits [95:64] are written to bits [63:32] of the destination. \n
4445/// Bits [127:96] are written to bits [127:96] of the destination.
4446/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4447static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4448 __m128i __b) {
4449 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4450 4 + 3);
4451}
4452
4453/// Unpacks the high-order (index 1) values from two 128-bit vectors of
4454/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4455///
4456/// \headerfile <x86intrin.h>
4457///
4458/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4459/// instruction.
4460///
4461/// \param __a
4462/// A 128-bit vector of [2 x i64]. \n
4463/// Bits [127:64] are written to bits [63:0] of the destination.
4464/// \param __b
4465/// A 128-bit vector of [2 x i64]. \n
4466/// Bits [127:64] are written to bits [127:64] of the destination.
4467/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4468static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4469 __m128i __b) {
4470 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4471}
4472
4473/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4474/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4475///
4476/// \headerfile <x86intrin.h>
4477///
4478/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4479/// instruction.
4480///
4481/// \param __a
4482/// A 128-bit vector of [16 x i8]. \n
4483/// Bits [7:0] are written to bits [7:0] of the result. \n
4484/// Bits [15:8] are written to bits [23:16] of the result. \n
4485/// Bits [23:16] are written to bits [39:32] of the result. \n
4486/// Bits [31:24] are written to bits [55:48] of the result. \n
4487/// Bits [39:32] are written to bits [71:64] of the result. \n
4488/// Bits [47:40] are written to bits [87:80] of the result. \n
4489/// Bits [55:48] are written to bits [103:96] of the result. \n
4490/// Bits [63:56] are written to bits [119:112] of the result.
4491/// \param __b
4492/// A 128-bit vector of [16 x i8]. \n
4493/// Bits [7:0] are written to bits [15:8] of the result. \n
4494/// Bits [15:8] are written to bits [31:24] of the result. \n
4495/// Bits [23:16] are written to bits [47:40] of the result. \n
4496/// Bits [31:24] are written to bits [63:56] of the result. \n
4497/// Bits [39:32] are written to bits [79:72] of the result. \n
4498/// Bits [47:40] are written to bits [95:88] of the result. \n
4499/// Bits [55:48] are written to bits [111:104] of the result. \n
4500/// Bits [63:56] are written to bits [127:120] of the result.
4501/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4502static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4503 __m128i __b) {
4504 return (__m128i)__builtin_shufflevector(
4505 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4506 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4507}
4508
4509/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4510/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4511/// [8 x i16].
4512///
4513/// \headerfile <x86intrin.h>
4514///
4515/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4516/// instruction.
4517///
4518/// \param __a
4519/// A 128-bit vector of [8 x i16]. \n
4520/// Bits [15:0] are written to bits [15:0] of the result. \n
4521/// Bits [31:16] are written to bits [47:32] of the result. \n
4522/// Bits [47:32] are written to bits [79:64] of the result. \n
4523/// Bits [63:48] are written to bits [111:96] of the result.
4524/// \param __b
4525/// A 128-bit vector of [8 x i16]. \n
4526/// Bits [15:0] are written to bits [31:16] of the result. \n
4527/// Bits [31:16] are written to bits [63:48] of the result. \n
4528/// Bits [47:32] are written to bits [95:80] of the result. \n
4529/// Bits [63:48] are written to bits [127:112] of the result.
4530/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4531static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4532 __m128i __b) {
4533 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4534 8 + 1, 2, 8 + 2, 3, 8 + 3);
4535}
4536
4537/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4538/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4539///
4540/// \headerfile <x86intrin.h>
4541///
4542/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4543/// instruction.
4544///
4545/// \param __a
4546/// A 128-bit vector of [4 x i32]. \n
4547/// Bits [31:0] are written to bits [31:0] of the destination. \n
4548/// Bits [63:32] are written to bits [95:64] of the destination.
4549/// \param __b
4550/// A 128-bit vector of [4 x i32]. \n
4551/// Bits [31:0] are written to bits [63:32] of the destination. \n
4552/// Bits [63:32] are written to bits [127:96] of the destination.
4553/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4554static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4555 __m128i __b) {
4556 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4557 4 + 1);
4558}
4559
4560/// Unpacks the low-order (index 0) values from two 128-bit vectors of
4561/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4562///
4563/// \headerfile <x86intrin.h>
4564///
4565/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4566/// instruction.
4567///
4568/// \param __a
4569/// A 128-bit vector of [2 x i64]. \n
4570/// Bits [63:0] are written to bits [63:0] of the destination.
4571/// \param __b
4572/// A 128-bit vector of [2 x i64]. \n
4573/// Bits [63:0] are written to bits [127:64] of the destination.
4574/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4575static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4576 __m128i __b) {
4577 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4578}
4579
4580/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4581/// integer.
4582///
4583/// \headerfile <x86intrin.h>
4584///
4585/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4586///
4587/// \param __a
4588/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4589/// destination.
4590/// \returns A 64-bit value containing the lower 64 bits of \a __a.
4591static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4592 return (__m64)__a[0];
4593}
4594
4595/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4596/// upper bits.
4597///
4598/// \headerfile <x86intrin.h>
4599///
4600/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4601///
4602/// \param __a
4603/// A 64-bit value that is copied to bits [63:0] of the result.
4604/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4605/// the operand. The upper 64 bits are assigned zeros.
4606static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4607 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4608}
4609
4610/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4611/// integer vector, zeroing the upper bits.
4612///
4613/// \headerfile <x86intrin.h>
4614///
4615/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4616///
4617/// \param __a
4618/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4619/// destination.
4620/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4621/// the operand. The upper 64 bits are assigned zeros.
4622static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4623 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4624}
4625
4626/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4627/// [2 x double] and interleaves them into a 128-bit vector of [2 x
4628/// double].
4629///
4630/// \headerfile <x86intrin.h>
4631///
4632/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4633///
4634/// \param __a
4635/// A 128-bit vector of [2 x double]. \n
4636/// Bits [127:64] are written to bits [63:0] of the destination.
4637/// \param __b
4638/// A 128-bit vector of [2 x double]. \n
4639/// Bits [127:64] are written to bits [127:64] of the destination.
4640/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4641static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4642 __m128d __b) {
4643 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4644}
4645
4646/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4647/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4648/// double].
4649///
4650/// \headerfile <x86intrin.h>
4651///
4652/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4653///
4654/// \param __a
4655/// A 128-bit vector of [2 x double]. \n
4656/// Bits [63:0] are written to bits [63:0] of the destination.
4657/// \param __b
4658/// A 128-bit vector of [2 x double]. \n
4659/// Bits [63:0] are written to bits [127:64] of the destination.
4660/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4661static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4662 __m128d __b) {
4663 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4664}
4665
4666/// Extracts the sign bits of the double-precision values in the 128-bit
4667/// vector of [2 x double], zero-extends the value, and writes it to the
4668/// low-order bits of the destination.
4669///
4670/// \headerfile <x86intrin.h>
4671///
4672/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4673///
4674/// \param __a
4675/// A 128-bit vector of [2 x double] containing the values with sign bits to
4676/// be extracted.
4677/// \returns The sign bits from each of the double-precision elements in \a __a,
4678/// written to bits [1:0]. The remaining bits are assigned values of zero.
4679static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4680 return __builtin_ia32_movmskpd((__v2df)__a);
4681}
4682
/// Builds a 128-bit floating-point vector of [2 x double] by picking one
///    element from each of two 128-bit vectors of [2 x double], as directed
///    by an immediate-value selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. Its two least significant bits select the
///    elements taken from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: The \c _MM_SHUFFLE2 macro can be used to build the selector;
///    <c>_MM_SHUFFLE2(b1, b0)</c> produces a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd( \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (int)(i)))
4713
4714/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4715/// floating-point vector of [4 x float].
4716///
4717/// \headerfile <x86intrin.h>
4718///
4719/// This intrinsic has no corresponding instruction.
4720///
4721/// \param __a
4722/// A 128-bit floating-point vector of [2 x double].
4723/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4724/// bitwise pattern as the parameter.
4725static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4726 return (__m128)__a;
4727}
4728
4729/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4730/// integer vector.
4731///
4732/// \headerfile <x86intrin.h>
4733///
4734/// This intrinsic has no corresponding instruction.
4735///
4736/// \param __a
4737/// A 128-bit floating-point vector of [2 x double].
4738/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4739/// parameter.
4740static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4741 return (__m128i)__a;
4742}
4743
4744/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4745/// floating-point vector of [2 x double].
4746///
4747/// \headerfile <x86intrin.h>
4748///
4749/// This intrinsic has no corresponding instruction.
4750///
4751/// \param __a
4752/// A 128-bit floating-point vector of [4 x float].
4753/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4754/// bitwise pattern as the parameter.
4755static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4756 return (__m128d)__a;
4757}
4758
4759/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4760/// integer vector.
4761///
4762/// \headerfile <x86intrin.h>
4763///
4764/// This intrinsic has no corresponding instruction.
4765///
4766/// \param __a
4767/// A 128-bit floating-point vector of [4 x float].
4768/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4769/// parameter.
4770static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4771 return (__m128i)__a;
4772}
4773
4774/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4775/// of [4 x float].
4776///
4777/// \headerfile <x86intrin.h>
4778///
4779/// This intrinsic has no corresponding instruction.
4780///
4781/// \param __a
4782/// A 128-bit integer vector.
4783/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4784/// bitwise pattern as the parameter.
4785static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4786 return (__m128)__a;
4787}
4788
4789/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4790/// of [2 x double].
4791///
4792/// \headerfile <x86intrin.h>
4793///
4794/// This intrinsic has no corresponding instruction.
4795///
4796/// \param __a
4797/// A 128-bit integer vector.
4798/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4799/// bitwise pattern as the parameter.
4800static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4801 return (__m128d)__a;
4802}
4803
/// Performs an element-wise comparison of the double-precision values in two
///    128-bit vectors of [2 x double]; the comparison applied is chosen by
///    the immediate integer operand.
///
///    Each comparison yields 0x0 for false and 0xFFFFFFFFFFFFFFFF for true.
///    When either value in a comparison is NaN, ordered comparisons yield
///    false and unordered comparisons yield true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand; bits [4:0] select the comparison
///    operation: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd( \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)))
4839
/// Performs a scalar comparison of the corresponding double-precision values
///    of two 128-bit vectors of [2 x double]; the comparison applied is
///    chosen by the immediate integer operand.
///
///    Each comparison yields 0x0 for false and 0xFFFFFFFFFFFFFFFF for true.
///    When either value in a comparison is NaN, ordered comparisons yield
///    false and unordered comparisons yield true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand; bits [4:0] select the comparison
///    operation: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd( \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)))
4875
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is executing a spin loop, so that power
///    consumption can be optimized for the duration of the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
4892#undef __DEFAULT_FN_ATTRS
4893#undef __DEFAULT_FN_ATTRS_MMX
4894
/* Packs two selectors into a 2-bit value: x is shifted into bit 1 and y is
 * OR-ed into bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
4896
/* Mode values for the denormals-zero control bit (0x0040) in the word read
 * by _mm_getcsr() and written by _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Mask that isolates the denormals-zero control bit. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Yields the current mode: the _mm_getcsr() value restricted to the mask. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

/* Clears the mode bit in the current _mm_getcsr() value, ORs in x, and hands
 * the result to _mm_setcsr(). */
#define _MM_SET_DENORMALS_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4905
4906#endif /* __EMMINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1489
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3742
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1044
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4531
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4606
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1953
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3585
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1020
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1805
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4188
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2359
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:585
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:74
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:212
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4740
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:398
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1646
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4037
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4263
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2811
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2662
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:820
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1186
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1609
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2559
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3410
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3978
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1162
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3545
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1210
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2154
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1553
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3075
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3002
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1789
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3215
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1823
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2507
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:742
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:193
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3997
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3235
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2697
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:519
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3949
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:298
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1684
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1665
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4641
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:767
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2681
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3020
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2416
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1138
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2866
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:415
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2321
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2258
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1933
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4800
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2984
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4502
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:793
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3155
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:973
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:52
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4661
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2581
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:717
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:669
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4575
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2904
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4679
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3113
static __inline__ void int __a
Definition: emmintrin.h:4057
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:153
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2645
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3929
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4622
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4468
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition: emmintrin.h:1470
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3313
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3379
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4554
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3175
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:253
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1876
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4424
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1508
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1337
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4770
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1234
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2283
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2220
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3364
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1627
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:3038
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2757
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:606
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1379
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1356
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:114
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1735
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2198
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2176
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2603
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2490
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1755
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4447
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1276
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2378
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1092
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:564
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2397
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:381
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:996
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2302
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4725
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2847
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4142
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3689
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2885
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1593
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1068
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:648
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4755
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2009
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2793
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2473
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:947
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:847
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2028
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1914
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3440
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3723
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: emmintrin.h:4165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2775
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2052
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:922
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:498
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3425
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:692
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4019
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3195
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4785
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1857
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:361
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3497
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1401
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2452
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:897
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:3056
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:872
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers,...
Definition: emmintrin.h:3350
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3796
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1709
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3634
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3672
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:236
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3518
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2542
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:277
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2829
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:92
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3909
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1893
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1569
#define __DEFAULT_FN_ATTRS_MMX
Definition: emmintrin.h:55
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:344
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2340
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4591
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3874
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:171
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:435
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1773
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1523
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:323
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1538
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3460
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3706
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3655
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3394
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2524
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2239
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2434
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:1992
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4396
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2966
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1970
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1258
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2073
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the ...
Definition: emmintrin.h:2624
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2111
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1294
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3858
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1116
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:541
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3889
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2923
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3764
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1450
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2714
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:627
struct __storeu_i16 *__P __v
Definition: immintrin.h:480