clang 19.0.0git
emmintrin.h
Go to the documentation of this file.
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __EMMINTRIN_H
11#define __EMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <xmmintrin.h>
18
/* Public 128-bit SSE2 vector types. Each is a 16-byte vector with 16-byte
 * alignment: __m128d has double elements, __m128i has long long elements. */
19typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21
/* The _u variants have the same element type and size but only 1-byte
 * alignment (presumably for unaligned memory access; their uses are not
 * visible in this part of the file). */
22typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23typedef long long __m128i_u
24 __attribute__((__vector_size__(16), __aligned__(1)));
25
26/* Type defines: 16-byte vectors viewed with a specific element type. The
 * functions in this file cast __m128d to __v2df (arithmetic, compares,
 * builtins) or to __v2du (bitwise operations) before operating on it. */
27typedef double __v2df __attribute__((__vector_size__(16)));
28typedef long long __v2di __attribute__((__vector_size__(16)));
29typedef short __v8hi __attribute__((__vector_size__(16)));
30typedef char __v16qi __attribute__((__vector_size__(16)));
31
32/* Unsigned types */
33typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36
37/* We need an explicitly signed variant for char. Note that this shouldn't
38 * appear in the interface though. */
39typedef signed char __v16qs __attribute__((__vector_size__(16)));
40
41#ifdef __SSE2__
42/* Both _Float16 and __bf16 require SSE2 being enabled. */
/* 16-byte vectors of _Float16: __v8hf and __m128h are 16-byte aligned,
 * __m128h_u is only 1-byte aligned. */
43typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46
/* 16-byte, 16-byte-aligned vectors of __bf16. */
47typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49#endif
50
51/* Define the default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS: always inlined, no debug info, compiled with the
 * "sse2,no-evex512" target features and a minimum vector width of 128.
 * __DEFAULT_FN_ATTRS_MMX: the same, but additionally enables the "mmx"
 * target feature and uses a minimum vector width of 64. */
52#define __DEFAULT_FN_ATTRS \
53 __attribute__((__always_inline__, __nodebug__, \
54 __target__("sse2,no-evex512"), __min_vector_width__(128)))
55#define __DEFAULT_FN_ATTRS_MMX \
56 __attribute__((__always_inline__, __nodebug__, \
57 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
58
59/// Adds lower double-precision values in both operands and returns the
60/// sum in the lower 64 bits of the result. The upper 64 bits of the result
61/// are copied from the upper double-precision value of the first operand.
62///
63/// \headerfile <x86intrin.h>
64///
65/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66///
67/// \param __a
68/// A 128-bit vector of [2 x double] containing one of the source operands.
69/// \param __b
70/// A 128-bit vector of [2 x double] containing one of the source operands.
71/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
73/// from the upper 64 bits of the first source operand.
74static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75 __m128d __b) {
76 __a[0] += __b[0];
77 return __a;
78}
79
80/// Adds two 128-bit vectors of [2 x double].
81///
82/// \headerfile <x86intrin.h>
83///
84/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85///
86/// \param __a
87/// A 128-bit vector of [2 x double] containing one of the source operands.
88/// \param __b
89/// A 128-bit vector of [2 x double] containing one of the source operands.
90/// \returns A 128-bit vector of [2 x double] containing the sums of both
91/// operands.
92static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93 __m128d __b) {
94 return (__m128d)((__v2df)__a + (__v2df)__b);
95}
96
97/// Subtracts the lower double-precision value of the second operand
98/// from the lower double-precision value of the first operand and returns
99/// the difference in the lower 64 bits of the result. The upper 64 bits of
100/// the result are copied from the upper double-precision value of the first
101/// operand.
102///
103/// \headerfile <x86intrin.h>
104///
105/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106///
107/// \param __a
108/// A 128-bit vector of [2 x double] containing the minuend.
109/// \param __b
110/// A 128-bit vector of [2 x double] containing the subtrahend.
111/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112/// difference of the lower 64 bits of both operands. The upper 64 bits are
113/// copied from the upper 64 bits of the first source operand.
114static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115 __m128d __b) {
116 __a[0] -= __b[0];
117 return __a;
118}
119
120/// Subtracts two 128-bit vectors of [2 x double].
121///
122/// \headerfile <x86intrin.h>
123///
124/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125///
126/// \param __a
127/// A 128-bit vector of [2 x double] containing the minuend.
128/// \param __b
129/// A 128-bit vector of [2 x double] containing the subtrahend.
130/// \returns A 128-bit vector of [2 x double] containing the differences between
131/// both operands.
132static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133 __m128d __b) {
134 return (__m128d)((__v2df)__a - (__v2df)__b);
135}
136
137/// Multiplies lower double-precision values in both operands and returns
138/// the product in the lower 64 bits of the result. The upper 64 bits of the
139/// result are copied from the upper double-precision value of the first
140/// operand.
141///
142/// \headerfile <x86intrin.h>
143///
144/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145///
146/// \param __a
147/// A 128-bit vector of [2 x double] containing one of the source operands.
148/// \param __b
149/// A 128-bit vector of [2 x double] containing one of the source operands.
150/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151/// product of the lower 64 bits of both operands. The upper 64 bits are
152/// copied from the upper 64 bits of the first source operand.
153static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154 __m128d __b) {
155 __a[0] *= __b[0];
156 return __a;
157}
158
159/// Multiplies two 128-bit vectors of [2 x double].
160///
161/// \headerfile <x86intrin.h>
162///
163/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164///
165/// \param __a
166/// A 128-bit vector of [2 x double] containing one of the operands.
167/// \param __b
168/// A 128-bit vector of [2 x double] containing one of the operands.
169/// \returns A 128-bit vector of [2 x double] containing the products of both
170/// operands.
171static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172 __m128d __b) {
173 return (__m128d)((__v2df)__a * (__v2df)__b);
174}
175
176/// Divides the lower double-precision value of the first operand by the
177/// lower double-precision value of the second operand and returns the
178/// quotient in the lower 64 bits of the result. The upper 64 bits of the
179/// result are copied from the upper double-precision value of the first
180/// operand.
181///
182/// \headerfile <x86intrin.h>
183///
184/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185///
186/// \param __a
187/// A 128-bit vector of [2 x double] containing the dividend.
188/// \param __b
189/// A 128-bit vector of [2 x double] containing the divisor.
190/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191/// quotient of the lower 64 bits of both operands. The upper 64 bits are
192/// copied from the upper 64 bits of the first source operand.
193static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194 __m128d __b) {
195 __a[0] /= __b[0];
196 return __a;
197}
198
199/// Performs an element-by-element division of two 128-bit vectors of
200/// [2 x double].
201///
202/// \headerfile <x86intrin.h>
203///
204/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205///
206/// \param __a
207/// A 128-bit vector of [2 x double] containing the dividend.
208/// \param __b
209/// A 128-bit vector of [2 x double] containing the divisor.
210/// \returns A 128-bit vector of [2 x double] containing the quotients of both
211/// operands.
212static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213 __m128d __b) {
214 return (__m128d)((__v2df)__a / (__v2df)__b);
215}
216
217/// Calculates the square root of the lower double-precision value of
218/// the second operand and returns it in the lower 64 bits of the result.
219/// The upper 64 bits of the result are copied from the upper
220/// double-precision value of the first operand.
221///
222/// \headerfile <x86intrin.h>
223///
224/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225///
226/// \param __a
227/// A 128-bit vector of [2 x double] containing one of the operands. The
228/// upper 64 bits of this operand are copied to the upper 64 bits of the
229/// result.
230/// \param __b
231/// A 128-bit vector of [2 x double] containing one of the operands. The
232/// square root is calculated using the lower 64 bits of this operand.
233/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234/// square root of the lower 64 bits of operand \a __b, and whose upper 64
235/// bits are copied from the upper 64 bits of operand \a __a.
236static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237 __m128d __b) {
238 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239 return __extension__(__m128d){__c[0], __a[1]};
240}
241
242/// Calculates the square root of each of the two values stored in a
243/// 128-bit vector of [2 x double].
244///
245/// \headerfile <x86intrin.h>
246///
247/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248///
249/// \param __a
250/// A 128-bit vector of [2 x double].
251/// \returns A 128-bit vector of [2 x double] containing the square roots of the
252/// values in the operand.
253static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254 return __builtin_ia32_sqrtpd((__v2df)__a);
255}
256
257/// Compares lower 64-bit double-precision values of both operands, and
258/// returns the lesser of the pair of values in the lower 64-bits of the
259/// result. The upper 64 bits of the result are copied from the upper
260/// double-precision value of the first operand.
261///
262/// \headerfile <x86intrin.h>
263///
264/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
265///
266/// \param __a
267/// A 128-bit vector of [2 x double] containing one of the operands. The
268/// lower 64 bits of this operand are used in the comparison.
269/// \param __b
270/// A 128-bit vector of [2 x double] containing one of the operands. The
271/// lower 64 bits of this operand are used in the comparison.
272/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273/// minimum value between both operands. The upper 64 bits are copied from
274/// the upper 64 bits of the first source operand.
275static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
276 __m128d __b) {
277 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
278}
279
280/// Performs element-by-element comparison of the two 128-bit vectors of
281/// [2 x double] and returns the vector containing the lesser of each pair of
282/// values.
283///
284/// \headerfile <x86intrin.h>
285///
286/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
287///
288/// \param __a
289/// A 128-bit vector of [2 x double] containing one of the operands.
290/// \param __b
291/// A 128-bit vector of [2 x double] containing one of the operands.
292/// \returns A 128-bit vector of [2 x double] containing the minimum values
293/// between both operands.
294static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
295 __m128d __b) {
296 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
297}
298
299/// Compares lower 64-bit double-precision values of both operands, and
300/// returns the greater of the pair of values in the lower 64-bits of the
301/// result. The upper 64 bits of the result are copied from the upper
302/// double-precision value of the first operand.
303///
304/// \headerfile <x86intrin.h>
305///
306/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
307///
308/// \param __a
309/// A 128-bit vector of [2 x double] containing one of the operands. The
310/// lower 64 bits of this operand are used in the comparison.
311/// \param __b
312/// A 128-bit vector of [2 x double] containing one of the operands. The
313/// lower 64 bits of this operand are used in the comparison.
314/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315/// maximum value between both operands. The upper 64 bits are copied from
316/// the upper 64 bits of the first source operand.
317static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
318 __m128d __b) {
319 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
320}
321
322/// Performs element-by-element comparison of the two 128-bit vectors of
323/// [2 x double] and returns the vector containing the greater of each pair
324/// of values.
325///
326/// \headerfile <x86intrin.h>
327///
328/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
329///
330/// \param __a
331/// A 128-bit vector of [2 x double] containing one of the operands.
332/// \param __b
333/// A 128-bit vector of [2 x double] containing one of the operands.
334/// \returns A 128-bit vector of [2 x double] containing the maximum values
335/// between both operands.
336static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
337 __m128d __b) {
338 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
339}
340
341/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342///
343/// \headerfile <x86intrin.h>
344///
345/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346///
347/// \param __a
348/// A 128-bit vector of [2 x double] containing one of the source operands.
349/// \param __b
350/// A 128-bit vector of [2 x double] containing one of the source operands.
351/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352/// values between both operands.
353static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
354 __m128d __b) {
355 return (__m128d)((__v2du)__a & (__v2du)__b);
356}
357
358/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359/// the one's complement of the values contained in the first source operand.
360///
361/// \headerfile <x86intrin.h>
362///
363/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
364///
365/// \param __a
366/// A 128-bit vector of [2 x double] containing the left source operand. The
367/// one's complement of this value is used in the bitwise AND.
368/// \param __b
369/// A 128-bit vector of [2 x double] containing the right source operand.
370/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371/// values in the second operand and the one's complement of the first
372/// operand.
373static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
374 __m128d __b) {
375 return (__m128d)(~(__v2du)__a & (__v2du)__b);
376}
377
378/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
379///
380/// \headerfile <x86intrin.h>
381///
382/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
383///
384/// \param __a
385/// A 128-bit vector of [2 x double] containing one of the source operands.
386/// \param __b
387/// A 128-bit vector of [2 x double] containing one of the source operands.
388/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389/// values between both operands.
390static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
391 __m128d __b) {
392 return (__m128d)((__v2du)__a | (__v2du)__b);
393}
394
395/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
396///
397/// \headerfile <x86intrin.h>
398///
399/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
400///
401/// \param __a
402/// A 128-bit vector of [2 x double] containing one of the source operands.
403/// \param __b
404/// A 128-bit vector of [2 x double] containing one of the source operands.
405/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406/// values between both operands.
407static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
408 __m128d __b) {
409 return (__m128d)((__v2du)__a ^ (__v2du)__b);
410}
411
412/// Compares each of the corresponding double-precision values of the
413/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414/// for false, 0xFFFFFFFFFFFFFFFF for true.
415///
416/// \headerfile <x86intrin.h>
417///
418/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
419///
420/// \param __a
421/// A 128-bit vector of [2 x double].
422/// \param __b
423/// A 128-bit vector of [2 x double].
424/// \returns A 128-bit vector containing the comparison results.
425static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
426 __m128d __b) {
427 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
428}
429
430/// Compares each of the corresponding double-precision values of the
431/// 128-bit vectors of [2 x double] to determine if the values in the first
432/// operand are less than those in the second operand. Each comparison
433/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
434///
435/// \headerfile <x86intrin.h>
436///
437/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
438///
439/// \param __a
440/// A 128-bit vector of [2 x double].
441/// \param __b
442/// A 128-bit vector of [2 x double].
443/// \returns A 128-bit vector containing the comparison results.
444static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
445 __m128d __b) {
446 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
447}
448
449/// Compares each of the corresponding double-precision values of the
450/// 128-bit vectors of [2 x double] to determine if the values in the first
451/// operand are less than or equal to those in the second operand.
452///
453/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
454///
455/// \headerfile <x86intrin.h>
456///
457/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
458///
459/// \param __a
460/// A 128-bit vector of [2 x double].
461/// \param __b
462/// A 128-bit vector of [2 x double].
463/// \returns A 128-bit vector containing the comparison results.
464static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
465 __m128d __b) {
466 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
467}
468
469/// Compares each of the corresponding double-precision values of the
470/// 128-bit vectors of [2 x double] to determine if the values in the first
471/// operand are greater than those in the second operand.
472///
473/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
474///
475/// \headerfile <x86intrin.h>
476///
477/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
478///
479/// \param __a
480/// A 128-bit vector of [2 x double].
481/// \param __b
482/// A 128-bit vector of [2 x double].
483/// \returns A 128-bit vector containing the comparison results.
484static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
485 __m128d __b) {
486 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
487}
488
489/// Compares each of the corresponding double-precision values of the
490/// 128-bit vectors of [2 x double] to determine if the values in the first
491/// operand are greater than or equal to those in the second operand.
492///
493/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
494///
495/// \headerfile <x86intrin.h>
496///
497/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
498///
499/// \param __a
500/// A 128-bit vector of [2 x double].
501/// \param __b
502/// A 128-bit vector of [2 x double].
503/// \returns A 128-bit vector containing the comparison results.
504static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
505 __m128d __b) {
506 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
507}
508
509/// Compares each of the corresponding double-precision values of the
510/// 128-bit vectors of [2 x double] to determine if the values in the first
511/// operand are ordered with respect to those in the second operand.
512///
513/// A pair of double-precision values are "ordered" with respect to each
514/// other if neither value is a NaN. Each comparison yields 0x0 for false,
515/// 0xFFFFFFFFFFFFFFFF for true.
516///
517/// \headerfile <x86intrin.h>
518///
519/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
520///
521/// \param __a
522/// A 128-bit vector of [2 x double].
523/// \param __b
524/// A 128-bit vector of [2 x double].
525/// \returns A 128-bit vector containing the comparison results.
526static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
527 __m128d __b) {
528 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
529}
530
531/// Compares each of the corresponding double-precision values of the
532/// 128-bit vectors of [2 x double] to determine if the values in the first
533/// operand are unordered with respect to those in the second operand.
534///
535/// A pair of double-precision values are "unordered" with respect to each
536/// other if one or both values are NaN. Each comparison yields 0x0 for
537/// false, 0xFFFFFFFFFFFFFFFF for true.
538///
539/// \headerfile <x86intrin.h>
540///
541/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
542/// instruction.
543///
544/// \param __a
545/// A 128-bit vector of [2 x double].
546/// \param __b
547/// A 128-bit vector of [2 x double].
548/// \returns A 128-bit vector containing the comparison results.
549static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
550 __m128d __b) {
551 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
552}
553
554/// Compares each of the corresponding double-precision values of the
555/// 128-bit vectors of [2 x double] to determine if the values in the first
556/// operand are unequal to those in the second operand.
557///
558/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
559///
560/// \headerfile <x86intrin.h>
561///
562/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
563///
564/// \param __a
565/// A 128-bit vector of [2 x double].
566/// \param __b
567/// A 128-bit vector of [2 x double].
568/// \returns A 128-bit vector containing the comparison results.
569static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
570 __m128d __b) {
571 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
572}
573
574/// Compares each of the corresponding double-precision values of the
575/// 128-bit vectors of [2 x double] to determine if the values in the first
576/// operand are not less than those in the second operand.
577///
578/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
579///
580/// \headerfile <x86intrin.h>
581///
582/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
583///
584/// \param __a
585/// A 128-bit vector of [2 x double].
586/// \param __b
587/// A 128-bit vector of [2 x double].
588/// \returns A 128-bit vector containing the comparison results.
589static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
590 __m128d __b) {
591 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
592}
593
594/// Compares each of the corresponding double-precision values of the
595/// 128-bit vectors of [2 x double] to determine if the values in the first
596/// operand are not less than or equal to those in the second operand.
597///
598/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
599///
600/// \headerfile <x86intrin.h>
601///
602/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
603///
604/// \param __a
605/// A 128-bit vector of [2 x double].
606/// \param __b
607/// A 128-bit vector of [2 x double].
608/// \returns A 128-bit vector containing the comparison results.
609static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
610 __m128d __b) {
611 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
612}
613
614/// Compares each of the corresponding double-precision values of the
615/// 128-bit vectors of [2 x double] to determine if the values in the first
616/// operand are not greater than those in the second operand.
617///
618/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
619///
620/// \headerfile <x86intrin.h>
621///
622/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
623///
624/// \param __a
625/// A 128-bit vector of [2 x double].
626/// \param __b
627/// A 128-bit vector of [2 x double].
628/// \returns A 128-bit vector containing the comparison results.
629static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
630 __m128d __b) {
631 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
632}
633
634/// Compares each of the corresponding double-precision values of the
635/// 128-bit vectors of [2 x double] to determine if the values in the first
636/// operand are not greater than or equal to those in the second operand.
637///
638/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
639///
640/// \headerfile <x86intrin.h>
641///
642/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
643///
644/// \param __a
645/// A 128-bit vector of [2 x double].
646/// \param __b
647/// A 128-bit vector of [2 x double].
648/// \returns A 128-bit vector containing the comparison results.
649static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
650 __m128d __b) {
651 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
652}
653
654/// Compares the lower double-precision floating-point values in each of
655/// the two 128-bit floating-point vectors of [2 x double] for equality.
656///
657/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
658///
659/// \headerfile <x86intrin.h>
660///
661/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
662///
663/// \param __a
664/// A 128-bit vector of [2 x double]. The lower double-precision value is
665/// compared to the lower double-precision value of \a __b.
666/// \param __b
667/// A 128-bit vector of [2 x double]. The lower double-precision value is
668/// compared to the lower double-precision value of \a __a.
669/// \returns A 128-bit vector. The lower 64 bits contain the comparison
670/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
671static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
672 __m128d __b) {
673 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
674}
675
676/// Compares the lower double-precision floating-point values in each of
677/// the two 128-bit floating-point vectors of [2 x double] to determine if
678/// the value in the first parameter is less than the corresponding value in
679/// the second parameter.
680///
681/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
682///
683/// \headerfile <x86intrin.h>
684///
685/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
686///
687/// \param __a
688/// A 128-bit vector of [2 x double]. The lower double-precision value is
689/// compared to the lower double-precision value of \a __b.
690/// \param __b
691/// A 128-bit vector of [2 x double]. The lower double-precision value is
692/// compared to the lower double-precision value of \a __a.
693/// \returns A 128-bit vector. The lower 64 bits contain the comparison
694/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
696 __m128d __b) {
697 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
698}
699
700/// Compares the lower double-precision floating-point values in each of
701/// the two 128-bit floating-point vectors of [2 x double] to determine if
702/// the value in the first parameter is less than or equal to the
703/// corresponding value in the second parameter.
704///
705/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
706///
707/// \headerfile <x86intrin.h>
708///
709/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
710///
711/// \param __a
712/// A 128-bit vector of [2 x double]. The lower double-precision value is
713/// compared to the lower double-precision value of \a __b.
714/// \param __b
715/// A 128-bit vector of [2 x double]. The lower double-precision value is
716/// compared to the lower double-precision value of \a __a.
717/// \returns A 128-bit vector. The lower 64 bits contain the comparison
718/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
719static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
720 __m128d __b) {
721 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
722}
723
724/// Compares the lower double-precision floating-point values in each of
725/// the two 128-bit floating-point vectors of [2 x double] to determine if
726/// the value in the first parameter is greater than the corresponding value
727/// in the second parameter.
728///
729/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
730///
731/// \headerfile <x86intrin.h>
732///
733/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
734///
735/// \param __a
736/// A 128-bit vector of [2 x double]. The lower double-precision value is
737/// compared to the lower double-precision value of \a __b.
738/// \param __b
739/// A 128-bit vector of [2 x double]. The lower double-precision value is
740/// compared to the lower double-precision value of \a __a.
741/// \returns A 128-bit vector. The lower 64 bits contain the comparison
742/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
743static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
744 __m128d __b) {
745 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
746 return __extension__(__m128d){__c[0], __a[1]};
747}
748
749/// Compares the lower double-precision floating-point values in each of
750/// the two 128-bit floating-point vectors of [2 x double] to determine if
751/// the value in the first parameter is greater than or equal to the
752/// corresponding value in the second parameter.
753///
754/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
755///
756/// \headerfile <x86intrin.h>
757///
758/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
759///
760/// \param __a
761/// A 128-bit vector of [2 x double]. The lower double-precision value is
762/// compared to the lower double-precision value of \a __b.
763/// \param __b
764/// A 128-bit vector of [2 x double]. The lower double-precision value is
765/// compared to the lower double-precision value of \a __a.
766/// \returns A 128-bit vector. The lower 64 bits contains the comparison
767/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
768static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
769 __m128d __b) {
770 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
771 return __extension__(__m128d){__c[0], __a[1]};
772}
773
774/// Compares the lower double-precision floating-point values in each of
775/// the two 128-bit floating-point vectors of [2 x double] to determine if
776/// the value in the first parameter is "ordered" with respect to the
777/// corresponding value in the second parameter.
778///
779/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
780/// of double-precision values are "ordered" with respect to each other if
781/// neither value is a NaN.
782///
783/// \headerfile <x86intrin.h>
784///
785/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
786///
787/// \param __a
788/// A 128-bit vector of [2 x double]. The lower double-precision value is
789/// compared to the lower double-precision value of \a __b.
790/// \param __b
791/// A 128-bit vector of [2 x double]. The lower double-precision value is
792/// compared to the lower double-precision value of \a __a.
793/// \returns A 128-bit vector. The lower 64 bits contains the comparison
794/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
796 __m128d __b) {
797 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
798}
799
800/// Compares the lower double-precision floating-point values in each of
801/// the two 128-bit floating-point vectors of [2 x double] to determine if
802/// the value in the first parameter is "unordered" with respect to the
803/// corresponding value in the second parameter.
804///
805/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
806/// of double-precision values are "unordered" with respect to each other if
807/// one or both values are NaN.
808///
809/// \headerfile <x86intrin.h>
810///
811/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
812/// instruction.
813///
814/// \param __a
815/// A 128-bit vector of [2 x double]. The lower double-precision value is
816/// compared to the lower double-precision value of \a __b.
817/// \param __b
818/// A 128-bit vector of [2 x double]. The lower double-precision value is
819/// compared to the lower double-precision value of \a __a.
820/// \returns A 128-bit vector. The lower 64 bits contains the comparison
821/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
822static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
823 __m128d __b) {
824 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
825}
826
827/// Compares the lower double-precision floating-point values in each of
828/// the two 128-bit floating-point vectors of [2 x double] to determine if
829/// the value in the first parameter is unequal to the corresponding value in
830/// the second parameter.
831///
832/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
833///
834/// \headerfile <x86intrin.h>
835///
836/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
837///
838/// \param __a
839/// A 128-bit vector of [2 x double]. The lower double-precision value is
840/// compared to the lower double-precision value of \a __b.
841/// \param __b
842/// A 128-bit vector of [2 x double]. The lower double-precision value is
843/// compared to the lower double-precision value of \a __a.
844/// \returns A 128-bit vector. The lower 64 bits contains the comparison
845/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
846static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
847 __m128d __b) {
848 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
849}
850
851/// Compares the lower double-precision floating-point values in each of
852/// the two 128-bit floating-point vectors of [2 x double] to determine if
853/// the value in the first parameter is not less than the corresponding
854/// value in the second parameter.
855///
856/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
857///
858/// \headerfile <x86intrin.h>
859///
860/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
861///
862/// \param __a
863/// A 128-bit vector of [2 x double]. The lower double-precision value is
864/// compared to the lower double-precision value of \a __b.
865/// \param __b
866/// A 128-bit vector of [2 x double]. The lower double-precision value is
867/// compared to the lower double-precision value of \a __a.
868/// \returns A 128-bit vector. The lower 64 bits contains the comparison
869/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
870static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
871 __m128d __b) {
872 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
873}
874
875/// Compares the lower double-precision floating-point values in each of
876/// the two 128-bit floating-point vectors of [2 x double] to determine if
877/// the value in the first parameter is not less than or equal to the
878/// corresponding value in the second parameter.
879///
880/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
881///
882/// \headerfile <x86intrin.h>
883///
884/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
885///
886/// \param __a
887/// A 128-bit vector of [2 x double]. The lower double-precision value is
888/// compared to the lower double-precision value of \a __b.
889/// \param __b
890/// A 128-bit vector of [2 x double]. The lower double-precision value is
891/// compared to the lower double-precision value of \a __a.
892/// \returns A 128-bit vector. The lower 64 bits contains the comparison
893/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
895 __m128d __b) {
896 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
897}
898
899/// Compares the lower double-precision floating-point values in each of
900/// the two 128-bit floating-point vectors of [2 x double] to determine if
901/// the value in the first parameter is not greater than the corresponding
902/// value in the second parameter.
903///
904/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
905///
906/// \headerfile <x86intrin.h>
907///
908/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
909///
910/// \param __a
911/// A 128-bit vector of [2 x double]. The lower double-precision value is
912/// compared to the lower double-precision value of \a __b.
913/// \param __b
914/// A 128-bit vector of [2 x double]. The lower double-precision value is
915/// compared to the lower double-precision value of \a __a.
916/// \returns A 128-bit vector. The lower 64 bits contains the comparison
917/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
918static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
919 __m128d __b) {
920 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
921 return __extension__(__m128d){__c[0], __a[1]};
922}
923
924/// Compares the lower double-precision floating-point values in each of
925/// the two 128-bit floating-point vectors of [2 x double] to determine if
926/// the value in the first parameter is not greater than or equal to the
927/// corresponding value in the second parameter.
928///
929/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
930///
931/// \headerfile <x86intrin.h>
932///
933/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
934///
935/// \param __a
936/// A 128-bit vector of [2 x double]. The lower double-precision value is
937/// compared to the lower double-precision value of \a __b.
938/// \param __b
939/// A 128-bit vector of [2 x double]. The lower double-precision value is
940/// compared to the lower double-precision value of \a __a.
941/// \returns A 128-bit vector. The lower 64 bits contains the comparison
942/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
943static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
944 __m128d __b) {
945 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
946 return __extension__(__m128d){__c[0], __a[1]};
947}
948
949/// Compares the lower double-precision floating-point values in each of
950/// the two 128-bit floating-point vectors of [2 x double] for equality.
951///
952/// The comparison yields 0 for false, 1 for true. If either of the two
953/// lower double-precision values is NaN, 0 is returned.
954///
955/// \headerfile <x86intrin.h>
956///
957/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
958///
959/// \param __a
960/// A 128-bit vector of [2 x double]. The lower double-precision value is
961/// compared to the lower double-precision value of \a __b.
962/// \param __b
963/// A 128-bit vector of [2 x double]. The lower double-precision value is
964/// compared to the lower double-precision value of \a __a.
965/// \returns An integer containing the comparison results. If either of the two
966/// lower double-precision values is NaN, 0 is returned.
967static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
968 __m128d __b) {
969 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
970}
971
972/// Compares the lower double-precision floating-point values in each of
973/// the two 128-bit floating-point vectors of [2 x double] to determine if
974/// the value in the first parameter is less than the corresponding value in
975/// the second parameter.
976///
977/// The comparison yields 0 for false, 1 for true. If either of the two
978/// lower double-precision values is NaN, 0 is returned.
979///
980/// \headerfile <x86intrin.h>
981///
982/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
983///
984/// \param __a
985/// A 128-bit vector of [2 x double]. The lower double-precision value is
986/// compared to the lower double-precision value of \a __b.
987/// \param __b
988/// A 128-bit vector of [2 x double]. The lower double-precision value is
989/// compared to the lower double-precision value of \a __a.
990/// \returns An integer containing the comparison results. If either of the two
991/// lower double-precision values is NaN, 0 is returned.
992static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
993 __m128d __b) {
994 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
995}
996
997/// Compares the lower double-precision floating-point values in each of
998/// the two 128-bit floating-point vectors of [2 x double] to determine if
999/// the value in the first parameter is less than or equal to the
1000/// corresponding value in the second parameter.
1001///
1002/// The comparison yields 0 for false, 1 for true. If either of the two
1003/// lower double-precision values is NaN, 0 is returned.
1004///
1005/// \headerfile <x86intrin.h>
1006///
1007/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008///
1009/// \param __a
1010/// A 128-bit vector of [2 x double]. The lower double-precision value is
1011/// compared to the lower double-precision value of \a __b.
1012/// \param __b
1013/// A 128-bit vector of [2 x double]. The lower double-precision value is
1014/// compared to the lower double-precision value of \a __a.
1015/// \returns An integer containing the comparison results. If either of the two
1016/// lower double-precision values is NaN, 0 is returned.
1017static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1018 __m128d __b) {
1019 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1020}
1021
1022/// Compares the lower double-precision floating-point values in each of
1023/// the two 128-bit floating-point vectors of [2 x double] to determine if
1024/// the value in the first parameter is greater than the corresponding value
1025/// in the second parameter.
1026///
1027/// The comparison yields 0 for false, 1 for true. If either of the two
1028/// lower double-precision values is NaN, 0 is returned.
1029///
1030/// \headerfile <x86intrin.h>
1031///
1032/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1033///
1034/// \param __a
1035/// A 128-bit vector of [2 x double]. The lower double-precision value is
1036/// compared to the lower double-precision value of \a __b.
1037/// \param __b
1038/// A 128-bit vector of [2 x double]. The lower double-precision value is
1039/// compared to the lower double-precision value of \a __a.
1040/// \returns An integer containing the comparison results. If either of the two
1041/// lower double-precision values is NaN, 0 is returned.
1042static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1043 __m128d __b) {
1044 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1045}
1046
1047/// Compares the lower double-precision floating-point values in each of
1048/// the two 128-bit floating-point vectors of [2 x double] to determine if
1049/// the value in the first parameter is greater than or equal to the
1050/// corresponding value in the second parameter.
1051///
1052/// The comparison yields 0 for false, 1 for true. If either of the two
1053/// lower double-precision values is NaN, 0 is returned.
1054///
1055/// \headerfile <x86intrin.h>
1056///
1057/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1058///
1059/// \param __a
1060/// A 128-bit vector of [2 x double]. The lower double-precision value is
1061/// compared to the lower double-precision value of \a __b.
1062/// \param __b
1063/// A 128-bit vector of [2 x double]. The lower double-precision value is
1064/// compared to the lower double-precision value of \a __a.
1065/// \returns An integer containing the comparison results. If either of the two
1066/// lower double-precision values is NaN, 0 is returned.
1067static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1068 __m128d __b) {
1069 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1070}
1071
1072/// Compares the lower double-precision floating-point values in each of
1073/// the two 128-bit floating-point vectors of [2 x double] to determine if
1074/// the value in the first parameter is unequal to the corresponding value in
1075/// the second parameter.
1076///
1077/// The comparison yields 0 for false, 1 for true. If either of the two
1078/// lower double-precision values is NaN, 1 is returned.
1079///
1080/// \headerfile <x86intrin.h>
1081///
1082/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1083///
1084/// \param __a
1085/// A 128-bit vector of [2 x double]. The lower double-precision value is
1086/// compared to the lower double-precision value of \a __b.
1087/// \param __b
1088/// A 128-bit vector of [2 x double]. The lower double-precision value is
1089/// compared to the lower double-precision value of \a __a.
1090/// \returns An integer containing the comparison results. If either of the two
1091/// lower double-precision values is NaN, 1 is returned.
1092static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1093 __m128d __b) {
1094 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1095}
1096
1097/// Compares the lower double-precision floating-point values in each of
1098/// the two 128-bit floating-point vectors of [2 x double] for equality. The
1099/// comparison yields 0 for false, 1 for true.
1100///
1101/// If either of the two lower double-precision values is NaN, 0 is returned.
1102///
1103/// \headerfile <x86intrin.h>
1104///
1105/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1106///
1107/// \param __a
1108/// A 128-bit vector of [2 x double]. The lower double-precision value is
1109/// compared to the lower double-precision value of \a __b.
1110/// \param __b
1111/// A 128-bit vector of [2 x double]. The lower double-precision value is
1112/// compared to the lower double-precision value of \a __a.
1113/// \returns An integer containing the comparison results. If either of the two
1114/// lower double-precision values is NaN, 0 is returned.
1115static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1116 __m128d __b) {
1117 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1118}
1119
1120/// Compares the lower double-precision floating-point values in each of
1121/// the two 128-bit floating-point vectors of [2 x double] to determine if
1122/// the value in the first parameter is less than the corresponding value in
1123/// the second parameter.
1124///
1125/// The comparison yields 0 for false, 1 for true. If either of the two lower
1126/// double-precision values is NaN, 0 is returned.
1127///
1128/// \headerfile <x86intrin.h>
1129///
1130/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1131///
1132/// \param __a
1133/// A 128-bit vector of [2 x double]. The lower double-precision value is
1134/// compared to the lower double-precision value of \a __b.
1135/// \param __b
1136/// A 128-bit vector of [2 x double]. The lower double-precision value is
1137/// compared to the lower double-precision value of \a __a.
1138/// \returns An integer containing the comparison results. If either of the two
1139/// lower double-precision values is NaN, 0 is returned.
1140static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1141 __m128d __b) {
1142 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1143}
1144
1145/// Compares the lower double-precision floating-point values in each of
1146/// the two 128-bit floating-point vectors of [2 x double] to determine if
1147/// the value in the first parameter is less than or equal to the
1148/// corresponding value in the second parameter.
1149///
1150/// The comparison yields 0 for false, 1 for true. If either of the two lower
1151/// double-precision values is NaN, 0 is returned.
1152///
1153/// \headerfile <x86intrin.h>
1154///
1155/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1156///
1157/// \param __a
1158/// A 128-bit vector of [2 x double]. The lower double-precision value is
1159/// compared to the lower double-precision value of \a __b.
1160/// \param __b
1161/// A 128-bit vector of [2 x double]. The lower double-precision value is
1162/// compared to the lower double-precision value of \a __a.
1163/// \returns An integer containing the comparison results. If either of the two
1164/// lower double-precision values is NaN, 0 is returned.
1165static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1166 __m128d __b) {
1167 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1168}
1169
1170/// Compares the lower double-precision floating-point values in each of
1171/// the two 128-bit floating-point vectors of [2 x double] to determine if
1172/// the value in the first parameter is greater than the corresponding value
1173/// in the second parameter.
1174///
1175/// The comparison yields 0 for false, 1 for true. If either of the two lower
1176/// double-precision values is NaN, 0 is returned.
1177///
1178/// \headerfile <x86intrin.h>
1179///
1180/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1181///
1182/// \param __a
1183/// A 128-bit vector of [2 x double]. The lower double-precision value is
1184/// compared to the lower double-precision value of \a __b.
1185/// \param __b
1186/// A 128-bit vector of [2 x double]. The lower double-precision value is
1187/// compared to the lower double-precision value of \a __a.
1188/// \returns An integer containing the comparison results. If either of the two
1189/// lower double-precision values is NaN, 0 is returned.
1190static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1191 __m128d __b) {
1192 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1193}
1194
1195/// Compares the lower double-precision floating-point values in each of
1196/// the two 128-bit floating-point vectors of [2 x double] to determine if
1197/// the value in the first parameter is greater than or equal to the
1198/// corresponding value in the second parameter.
1199///
1200/// The comparison yields 0 for false, 1 for true. If either of the two
1201/// lower double-precision values is NaN, 0 is returned.
1202///
1203/// \headerfile <x86intrin.h>
1204///
1205/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1206///
1207/// \param __a
1208/// A 128-bit vector of [2 x double]. The lower double-precision value is
1209/// compared to the lower double-precision value of \a __b.
1210/// \param __b
1211/// A 128-bit vector of [2 x double]. The lower double-precision value is
1212/// compared to the lower double-precision value of \a __a.
1213/// \returns An integer containing the comparison results. If either of the two
1214/// lower double-precision values is NaN, 0 is returned.
1215static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1216 __m128d __b) {
1217 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1218}
1219
1220/// Compares the lower double-precision floating-point values in each of
1221/// the two 128-bit floating-point vectors of [2 x double] to determine if
1222/// the value in the first parameter is unequal to the corresponding value in
1223/// the second parameter.
1224///
1225/// The comparison yields 0 for false, 1 for true. If either of the two lower
1226/// double-precision values is NaN, 1 is returned.
1227///
1228/// \headerfile <x86intrin.h>
1229///
1230/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1231///
1232/// \param __a
1233/// A 128-bit vector of [2 x double]. The lower double-precision value is
1234/// compared to the lower double-precision value of \a __b.
1235/// \param __b
1236/// A 128-bit vector of [2 x double]. The lower double-precision value is
1237/// compared to the lower double-precision value of \a __a.
1238/// \returns An integer containing the comparison result. If either of the two
1239/// lower double-precision values is NaN, 1 is returned.
1240static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1241 __m128d __b) {
1242 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1243}
1244
1245/// Converts the two double-precision floating-point elements of a
1246/// 128-bit vector of [2 x double] into two single-precision floating-point
1247/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1248/// The upper 64 bits of the result vector are set to zero.
1249///
1250/// \headerfile <x86intrin.h>
1251///
1252/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1253///
1254/// \param __a
1255/// A 128-bit vector of [2 x double].
1256/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1257/// converted values. The upper 64 bits are set to zero.
1258static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1259 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1260}
1261
1262/// Converts the lower two single-precision floating-point elements of a
1263/// 128-bit vector of [4 x float] into two double-precision floating-point
1264/// values, returned in a 128-bit vector of [2 x double]. The upper two
1265/// elements of the input vector are unused.
1266///
1267/// \headerfile <x86intrin.h>
1268///
1269/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1270///
1271/// \param __a
1272/// A 128-bit vector of [4 x float]. The lower two single-precision
1273/// floating-point elements are converted to double-precision values. The
1274/// upper two elements are unused.
1275/// \returns A 128-bit vector of [2 x double] containing the converted values.
1276static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1277 return (__m128d) __builtin_convertvector(
1278 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1279}
1280
1281/// Converts the lower two integer elements of a 128-bit vector of
1282/// [4 x i32] into two double-precision floating-point values, returned in a
1283/// 128-bit vector of [2 x double].
1284///
1285/// The upper two elements of the input vector are unused.
1286///
1287/// \headerfile <x86intrin.h>
1288///
1289/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1290///
1291/// \param __a
1292/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1293/// converted to double-precision values.
1294///
1295/// The upper two elements are unused.
1296/// \returns A 128-bit vector of [2 x double] containing the converted values.
1297static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1298 return (__m128d) __builtin_convertvector(
1299 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1300}
1301
1302/// Converts the two double-precision floating-point elements of a
1303/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1304/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1305/// 64 bits of the result vector are set to zero.
1306///
1307/// \headerfile <x86intrin.h>
1308///
1309/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1310///
1311/// \param __a
1312/// A 128-bit vector of [2 x double].
1313/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1314/// converted values. The upper 64 bits are set to zero.
1315static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1316 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1317}
1318
1319/// Converts the low-order element of a 128-bit vector of [2 x double]
1320/// into a 32-bit signed integer value.
1321///
1322/// \headerfile <x86intrin.h>
1323///
1324/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1325///
1326/// \param __a
1327/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1328/// conversion.
1329/// \returns A 32-bit signed integer containing the converted value.
1330static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1331 return __builtin_ia32_cvtsd2si((__v2df)__a);
1332}
1333
1334/// Converts the lower double-precision floating-point element of a
1335/// 128-bit vector of [2 x double], in the second parameter, into a
1336/// single-precision floating-point value, returned in the lower 32 bits of a
1337/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1338/// copied from the upper 96 bits of the first parameter.
1339///
1340/// \headerfile <x86intrin.h>
1341///
1342/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1343///
1344/// \param __a
1345/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1346/// copied to the upper 96 bits of the result.
1347/// \param __b
1348/// A 128-bit vector of [2 x double]. The lower double-precision
1349/// floating-point element is used in the conversion.
1350/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1351/// converted value from the second parameter. The upper 96 bits are copied
1352/// from the upper 96 bits of the first parameter.
1353static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1354 __m128d __b) {
1355 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1356}
1357
1358/// Converts a 32-bit signed integer value, in the second parameter, into
1359/// a double-precision floating-point value, returned in the lower 64 bits of
1360/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361/// are copied from the upper 64 bits of the first parameter.
1362///
1363/// \headerfile <x86intrin.h>
1364///
1365/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1366///
1367/// \param __a
1368/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369/// copied to the upper 64 bits of the result.
1370/// \param __b
1371/// A 32-bit signed integer containing the value to be converted.
1372/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373/// converted value from the second parameter. The upper 64 bits are copied
1374/// from the upper 64 bits of the first parameter.
1375static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1376 int __b) {
1377 __a[0] = __b;
1378 return __a;
1379}
1380
1381/// Converts the lower single-precision floating-point element of a
1382/// 128-bit vector of [4 x float], in the second parameter, into a
1383/// double-precision floating-point value, returned in the lower 64 bits of
1384/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1385/// are copied from the upper 64 bits of the first parameter.
1386///
1387/// \headerfile <x86intrin.h>
1388///
1389/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1390///
1391/// \param __a
1392/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1393/// copied to the upper 64 bits of the result.
1394/// \param __b
1395/// A 128-bit vector of [4 x float]. The lower single-precision
1396/// floating-point element is used in the conversion.
1397/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1398/// converted value from the second parameter. The upper 64 bits are copied
1399/// from the upper 64 bits of the first parameter.
1400static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1401 __m128 __b) {
1402 __a[0] = __b[0];
1403 return __a;
1404}
1405
1406/// Converts the two double-precision floating-point elements of a
1407/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1408/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1409///
1410/// If the result of either conversion is inexact, the result is truncated
1411/// (rounded towards zero) regardless of the current MXCSR setting. The upper
1412/// 64 bits of the result vector are set to zero.
1413///
1414/// \headerfile <x86intrin.h>
1415///
1416/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1417/// instruction.
1418///
1419/// \param __a
1420/// A 128-bit vector of [2 x double].
1421/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422/// converted values. The upper 64 bits are set to zero.
1423static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1424 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1425}
1426
1427/// Converts the low-order element of a [2 x double] vector into a 32-bit
1428/// signed integer value, truncating the result when it is inexact.
1429///
1430/// \headerfile <x86intrin.h>
1431///
1432/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1433/// instruction.
1434///
1435/// \param __a
1436/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1437/// conversion.
1438/// \returns A 32-bit signed integer containing the converted value.
1439static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1440 return __builtin_ia32_cvttsd2si((__v2df)__a);
1441}
1442
1443/// Converts the two double-precision floating-point elements of a
1444/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1445/// returned in a 64-bit vector of [2 x i32].
1446///
1447/// \headerfile <x86intrin.h>
1448///
1449/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1450///
1451/// \param __a
1452/// A 128-bit vector of [2 x double].
1453/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1454static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1455 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1456}
1457
1458/// Converts the two double-precision floating-point elements of a
1459/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1460/// returned in a 64-bit vector of [2 x i32].
1461///
1462/// If the result of either conversion is inexact, the result is truncated
1463/// (rounded towards zero) regardless of the current MXCSR setting.
1464///
1465/// \headerfile <x86intrin.h>
1466///
1467/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1468///
1469/// \param __a
1470/// A 128-bit vector of [2 x double].
1471/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1472static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1473 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1474}
1475
1476/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1477/// [2 x i32] into two double-precision floating-point values, returned in a
1478/// 128-bit vector of [2 x double].
1479///
1480/// \headerfile <x86intrin.h>
1481///
1482/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1483///
1484/// \param __a
1485/// A 64-bit vector of [2 x i32].
1486/// \returns A 128-bit vector of [2 x double] containing the converted values.
1487static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1488 return __builtin_ia32_cvtpi2pd((__v2si)__a);
1489}
1490
1491/// Returns the low-order element of a 128-bit vector of [2 x double] as
1492/// a double-precision floating-point value.
1493///
1494/// \headerfile <x86intrin.h>
1495///
1496/// This intrinsic has no corresponding instruction.
1497///
1498/// \param __a
1499/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1500/// \returns A double-precision floating-point value copied from the lower 64
1501/// bits of \a __a.
1502static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1503 return __a[0];
1504}
1505
1506/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1507/// memory location.
1508///
1509/// \headerfile <x86intrin.h>
1510///
1511/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1512///
1513/// \param __dp
1514/// A pointer to a 128-bit memory location. The address of the memory
1515/// location has to be 16-byte aligned.
1516/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1517static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1518 return *(const __m128d *)__dp;
1519}
1520
1521/// Loads a double-precision floating-point value from a specified memory
1522/// location and duplicates it to both vector elements of a 128-bit vector of
1523/// [2 x double].
1524///
1525/// \headerfile <x86intrin.h>
1526///
1527/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1528///
1529/// \param __dp
1530/// A pointer to a memory location containing a double-precision value.
1531/// \returns A 128-bit vector of [2 x double] containing the loaded and
1532/// duplicated values.
1533static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1534 struct __mm_load1_pd_struct {
1535 double __u;
1536 } __attribute__((__packed__, __may_alias__));
1537 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1538 return __extension__(__m128d){__u, __u};
1539}
1540
1541#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1542
1543/// Loads two double-precision values, in reverse order, from an aligned
1544/// memory location into a 128-bit vector of [2 x double].
1545///
1546/// \headerfile <x86intrin.h>
1547///
1548/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1549/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1550/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1551///
1552/// \param __dp
1553/// A 16-byte aligned pointer to an array of double-precision values to be
1554/// loaded in reverse order.
1555/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1556/// values.
1557static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1558 __m128d __u = *(const __m128d *)__dp;
1559 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1560}
1561
1562/// Loads a 128-bit floating-point vector of [2 x double] from an
1563/// unaligned memory location.
1564///
1565/// \headerfile <x86intrin.h>
1566///
1567/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1568///
1569/// \param __dp
1570/// A pointer to a 128-bit memory location. The address of the memory
1571/// location does not have to be aligned.
1572/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1573static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1574 struct __loadu_pd {
1575 __m128d_u __v;
1576 } __attribute__((__packed__, __may_alias__));
1577 return ((const struct __loadu_pd *)__dp)->__v;
1578}
1579
1580/// Loads a 64-bit integer value to the low element of a 128-bit integer
1581/// vector and clears the upper element.
1582///
1583/// \headerfile <x86intrin.h>
1584///
1585/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1586///
1587/// \param __a
1588/// A pointer to a 64-bit memory location. The address of the memory
1589/// location does not have to be aligned.
1590/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1591static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1592 struct __loadu_si64 {
1593 long long __v;
1594 } __attribute__((__packed__, __may_alias__));
1595 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1596 return __extension__(__m128i)(__v2di){__u, 0LL};
1597}
1598
1599/// Loads a 32-bit integer value to the low element of a 128-bit integer
1600/// vector and clears the upper element.
1601///
1602/// \headerfile <x86intrin.h>
1603///
1604/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1605///
1606/// \param __a
1607/// A pointer to a 32-bit memory location. The address of the memory
1608/// location does not have to be aligned.
1609/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1610static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1611 struct __loadu_si32 {
1612 int __v;
1613 } __attribute__((__packed__, __may_alias__));
1614 int __u = ((const struct __loadu_si32 *)__a)->__v;
1615 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1616}
1617
1618/// Loads a 16-bit integer value to the low element of a 128-bit integer
1619/// vector and clears the upper element.
1620///
1621/// \headerfile <x86intrin.h>
1622///
1623/// This intrinsic does not correspond to a specific instruction.
1624///
1625/// \param __a
1626/// A pointer to a 16-bit memory location. The address of the memory
1627/// location does not have to be aligned.
1628/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1629static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1630 struct __loadu_si16 {
1631 short __v;
1632 } __attribute__((__packed__, __may_alias__));
1633 short __u = ((const struct __loadu_si16 *)__a)->__v;
1634 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1635}
1636
1637/// Loads a 64-bit double-precision value to the low element of a
1638/// 128-bit integer vector and clears the upper element.
1639///
1640/// \headerfile <x86intrin.h>
1641///
1642/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1643///
1644/// \param __dp
1645/// A pointer to a memory location containing a double-precision value.
1646/// The address of the memory location does not have to be aligned.
1647/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1648static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1649 struct __mm_load_sd_struct {
1650 double __u;
1651 } __attribute__((__packed__, __may_alias__));
1652 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1653 return __extension__(__m128d){__u, 0};
1654}
1655
1656/// Loads a double-precision value into the high-order bits of a 128-bit
1657/// vector of [2 x double]. The low-order bits are copied from the low-order
1658/// bits of the first operand.
1659///
1660/// \headerfile <x86intrin.h>
1661///
1662/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1663///
1664/// \param __a
1665/// A 128-bit vector of [2 x double]. \n
1666/// Bits [63:0] are written to bits [63:0] of the result.
1667/// \param __dp
1668/// A pointer to a 64-bit memory location containing a double-precision
1669/// floating-point value that is loaded. The loaded value is written to bits
1670/// [127:64] of the result. The address of the memory location does not have
1671/// to be aligned.
1672/// \returns A 128-bit vector of [2 x double] containing the moved values.
1673static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1674 double const *__dp) {
1675 struct __mm_loadh_pd_struct {
1676 double __u;
1677 } __attribute__((__packed__, __may_alias__));
1678 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1679 return __extension__(__m128d){__a[0], __u};
1680}
1681
1682/// Loads a double-precision value into the low-order bits of a 128-bit
1683/// vector of [2 x double]. The high-order bits are copied from the
1684/// high-order bits of the first operand.
1685///
1686/// \headerfile <x86intrin.h>
1687///
1688/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1689///
1690/// \param __a
1691/// A 128-bit vector of [2 x double]. \n
1692/// Bits [127:64] are written to bits [127:64] of the result.
1693/// \param __dp
1694/// A pointer to a 64-bit memory location containing a double-precision
1695/// floating-point value that is loaded. The loaded value is written to bits
1696/// [63:0] of the result. The address of the memory location does not have to
1697/// be aligned.
1698/// \returns A 128-bit vector of [2 x double] containing the moved values.
1699static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1700 double const *__dp) {
1701 struct __mm_loadl_pd_struct {
1702 double __u;
1703 } __attribute__((__packed__, __may_alias__));
1704 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1705 return __extension__(__m128d){__u, __a[1]};
1706}
1707
1708/// Constructs a 128-bit floating-point vector of [2 x double] with
1709/// unspecified content. This could be used as an argument to another
1710/// intrinsic function where the argument is required but the value is not
1711/// actually used.
1712///
1713/// \headerfile <x86intrin.h>
1714///
1715/// This intrinsic has no corresponding instruction.
1716///
1717/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1718/// content.
1719static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1720 return (__m128d)__builtin_ia32_undef128();
1721}
1722
1723/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1724/// 64 bits of the vector are initialized with the specified double-precision
1725/// floating-point value. The upper 64 bits are set to zero.
1726///
1727/// \headerfile <x86intrin.h>
1728///
1729/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1730///
1731/// \param __w
1732/// A double-precision floating-point value used to initialize the lower 64
1733/// bits of the result.
1734/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1735/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1736/// set to zero.
1737static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1738 return __extension__(__m128d){__w, 0};
1739}
1740
1741/// Constructs a 128-bit floating-point vector of [2 x double], with each
1742/// of the two double-precision floating-point vector elements set to the
1743/// specified double-precision floating-point value.
1744///
1745/// \headerfile <x86intrin.h>
1746///
1747/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1748///
1749/// \param __w
1750/// A double-precision floating-point value used to initialize each vector
1751/// element of the result.
1752/// \returns An initialized 128-bit floating-point vector of [2 x double].
1753static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1754 return __extension__(__m128d){__w, __w};
1755}
1756
1757/// Constructs a 128-bit floating-point vector of [2 x double], with each
1758/// of the two double-precision floating-point vector elements set to the
1759/// specified double-precision floating-point value.
1760///
1761/// \headerfile <x86intrin.h>
1762///
1763/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1764///
1765/// \param __w
1766/// A double-precision floating-point value used to initialize each vector
1767/// element of the result.
1768/// \returns An initialized 128-bit floating-point vector of [2 x double].
1769static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1770 return _mm_set1_pd(__w);
1771}
1772
1773/// Constructs a 128-bit floating-point vector of [2 x double]
1774/// initialized with the specified double-precision floating-point values.
1775///
1776/// \headerfile <x86intrin.h>
1777///
1778/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1779///
1780/// \param __w
1781/// A double-precision floating-point value used to initialize the upper 64
1782/// bits of the result.
1783/// \param __x
1784/// A double-precision floating-point value used to initialize the lower 64
1785/// bits of the result.
1786/// \returns An initialized 128-bit floating-point vector of [2 x double].
1787static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1788 double __x) {
1789 return __extension__(__m128d){__x, __w};
1790}
1791
1792/// Constructs a 128-bit floating-point vector of [2 x double],
1793/// initialized in reverse order with the specified double-precision
1794/// floating-point values.
1795///
1796/// \headerfile <x86intrin.h>
1797///
1798/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1799///
1800/// \param __w
1801/// A double-precision floating-point value used to initialize the lower 64
1802/// bits of the result.
1803/// \param __x
1804/// A double-precision floating-point value used to initialize the upper 64
1805/// bits of the result.
1806/// \returns An initialized 128-bit floating-point vector of [2 x double].
1807static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1808 double __x) {
1809 return __extension__(__m128d){__w, __x};
1810}
1811
1812/// Constructs a 128-bit floating-point vector of [2 x double]
1813/// initialized to zero.
1814///
1815/// \headerfile <x86intrin.h>
1816///
1817/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1818///
1819/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1820/// all elements set to zero.
1821static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1822 return __extension__(__m128d){0.0, 0.0};
1823}
1824
1825/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1826/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1827/// 64 bits are set to the upper 64 bits of the first parameter.
1828///
1829/// \headerfile <x86intrin.h>
1830///
1831/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1832///
1833/// \param __a
1834/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1835/// upper 64 bits of the result.
1836/// \param __b
1837/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1838/// lower 64 bits of the result.
1839/// \returns A 128-bit vector of [2 x double] containing the moved values.
1840static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1841 __m128d __b) {
1842 __a[0] = __b[0];
1843 return __a;
1844}
1845
1846/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1847/// memory location.
1848///
1849/// \headerfile <x86intrin.h>
1850///
1851/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1852///
1853/// \param __dp
1854/// A pointer to a 64-bit memory location.
1855/// \param __a
1856/// A 128-bit vector of [2 x double] containing the value to be stored.
1857static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1858 __m128d __a) {
1859 struct __mm_store_sd_struct {
1860 double __u;
1861 } __attribute__((__packed__, __may_alias__));
1862 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1863}
1864
1865/// Moves packed double-precision values from a 128-bit vector of
1866/// [2 x double] to a memory location.
1867///
1868/// \headerfile <x86intrin.h>
1869///
1870/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1871///
1872/// \param __dp
1873/// A pointer to an aligned memory location that can store two
1874/// double-precision values.
1875/// \param __a
1876/// A packed 128-bit vector of [2 x double] containing the values to be
1877/// moved.
1878static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1879 __m128d __a) {
1880 *(__m128d *)__dp = __a;
1881}
1882
1883/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1884/// the upper and lower 64 bits of a memory location.
1885///
1886/// \headerfile <x86intrin.h>
1887///
1888/// This intrinsic corresponds to the
1889/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1890///
1891/// \param __dp
1892/// A pointer to a memory location that can store two double-precision
1893/// values.
1894/// \param __a
1895/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1896/// of the values in \a __dp.
1897static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1898 __m128d __a) {
1899 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1900 _mm_store_pd(__dp, __a);
1901}
1902
1903/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1904/// the upper and lower 64 bits of a memory location.
1905///
1906/// \headerfile <x86intrin.h>
1907///
1908/// This intrinsic corresponds to the
1909/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1910///
1911/// \param __dp
1912/// A pointer to a memory location that can store two double-precision
1913/// values.
1914/// \param __a
1915/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1916/// of the values in \a __dp.
1917static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1918 __m128d __a) {
1919 _mm_store1_pd(__dp, __a);
1920}
1921
1922/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1923/// location.
1924///
1925/// \headerfile <x86intrin.h>
1926///
1927/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1928///
1929/// \param __dp
1930/// A pointer to a 128-bit memory location. The address of the memory
1931/// location does not have to be aligned.
1932/// \param __a
1933/// A 128-bit vector of [2 x double] containing the values to be stored.
1934static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1935 __m128d __a) {
1936 struct __storeu_pd {
1937 __m128d_u __v;
1938 } __attribute__((__packed__, __may_alias__));
1939 ((struct __storeu_pd *)__dp)->__v = __a;
1940}
1941
1942/// Stores two double-precision values, in reverse order, from a 128-bit
1943/// vector of [2 x double] to a 16-byte aligned memory location.
1944///
1945/// \headerfile <x86intrin.h>
1946///
1947/// This intrinsic corresponds to a shuffling instruction followed by a
1948/// <c> VMOVAPD / MOVAPD </c> instruction.
1949///
1950/// \param __dp
1951/// A pointer to a 16-byte aligned memory location that can store two
1952/// double-precision values.
1953/// \param __a
1954/// A 128-bit vector of [2 x double] containing the values to be reversed and
1955/// stored.
1956static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1957 __m128d __a) {
1958 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1959 *(__m128d *)__dp = __a;
1960}
1961
1962/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1963/// memory location.
1964///
1965/// \headerfile <x86intrin.h>
1966///
1967/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1968///
1969/// \param __dp
1970/// A pointer to a 64-bit memory location.
1971/// \param __a
1972/// A 128-bit vector of [2 x double] containing the value to be stored.
1973static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1974 __m128d __a) {
1975 struct __mm_storeh_pd_struct {
1976 double __u;
1977 } __attribute__((__packed__, __may_alias__));
1978 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1979}
1980
1981/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1982/// memory location.
1983///
1984/// \headerfile <x86intrin.h>
1985///
1986/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1987///
1988/// \param __dp
1989/// A pointer to a 64-bit memory location.
1990/// \param __a
1991/// A 128-bit vector of [2 x double] containing the value to be stored.
1992static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1993 __m128d __a) {
1994 struct __mm_storeh_pd_struct {
1995 double __u;
1996 } __attribute__((__packed__, __may_alias__));
1997 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1998}
1999
2000/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2001/// saving the lower 8 bits of each sum in the corresponding element of a
2002/// 128-bit result vector of [16 x i8].
2003///
2004/// The integer elements of both parameters can be either signed or unsigned.
2005///
2006/// \headerfile <x86intrin.h>
2007///
2008/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2009///
2010/// \param __a
2011/// A 128-bit vector of [16 x i8].
2012/// \param __b
2013/// A 128-bit vector of [16 x i8].
2014/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2015/// parameters.
2016static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2017 __m128i __b) {
2018 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2019}
2020
2021/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2022/// saving the lower 16 bits of each sum in the corresponding element of a
2023/// 128-bit result vector of [8 x i16].
2024///
2025/// The integer elements of both parameters can be either signed or unsigned.
2026///
2027/// \headerfile <x86intrin.h>
2028///
2029/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2030///
2031/// \param __a
2032/// A 128-bit vector of [8 x i16].
2033/// \param __b
2034/// A 128-bit vector of [8 x i16].
2035/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2036/// parameters.
2037static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2038 __m128i __b) {
2039 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2040}
2041
2042/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2043/// saving the lower 32 bits of each sum in the corresponding element of a
2044/// 128-bit result vector of [4 x i32].
2045///
2046/// The integer elements of both parameters can be either signed or unsigned.
2047///
2048/// \headerfile <x86intrin.h>
2049///
2050/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2051///
2052/// \param __a
2053/// A 128-bit vector of [4 x i32].
2054/// \param __b
2055/// A 128-bit vector of [4 x i32].
2056/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2057/// parameters.
2058static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2059 __m128i __b) {
2060 return (__m128i)((__v4su)__a + (__v4su)__b);
2061}
2062
2063/// Adds two signed or unsigned 64-bit integer values, returning the
2064/// lower 64 bits of the sum.
2065///
2066/// \headerfile <x86intrin.h>
2067///
2068/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2069///
2070/// \param __a
2071/// A 64-bit integer.
2072/// \param __b
2073/// A 64-bit integer.
2074/// \returns A 64-bit integer containing the sum of both parameters.
2075static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2076 __m64 __b) {
2077 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2078}
2079
2080/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2081/// saving the lower 64 bits of each sum in the corresponding element of a
2082/// 128-bit result vector of [2 x i64].
2083///
2084/// The integer elements of both parameters can be either signed or unsigned.
2085///
2086/// \headerfile <x86intrin.h>
2087///
2088/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2089///
2090/// \param __a
2091/// A 128-bit vector of [2 x i64].
2092/// \param __b
2093/// A 128-bit vector of [2 x i64].
2094/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2095/// parameters.
2096static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2097 __m128i __b) {
2098 return (__m128i)((__v2du)__a + (__v2du)__b);
2099}
2100
2101/// Adds, with saturation, the corresponding elements of two 128-bit
2102/// signed [16 x i8] vectors, saving each sum in the corresponding element
2103/// of a 128-bit result vector of [16 x i8].
2104///
2105/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2106/// less than 0x80 are saturated to 0x80.
2107///
2108/// \headerfile <x86intrin.h>
2109///
2110/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2111///
2112/// \param __a
2113/// A 128-bit signed [16 x i8] vector.
2114/// \param __b
2115/// A 128-bit signed [16 x i8] vector.
2116/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2117/// both parameters.
2118static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2119 __m128i __b) {
2120 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2121}
2122
2123/// Adds, with saturation, the corresponding elements of two 128-bit
2124/// signed [8 x i16] vectors, saving each sum in the corresponding element
2125/// of a 128-bit result vector of [8 x i16].
2126///
2127/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2128/// less than 0x8000 are saturated to 0x8000.
2129///
2130/// \headerfile <x86intrin.h>
2131///
2132/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2133///
2134/// \param __a
2135/// A 128-bit signed [8 x i16] vector.
2136/// \param __b
2137/// A 128-bit signed [8 x i16] vector.
2138/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2139/// both parameters.
2140static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2141 __m128i __b) {
2142 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2143}
2144
2145/// Adds, with saturation, the corresponding elements of two 128-bit
2146/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2147/// of a 128-bit result vector of [16 x i8].
2148///
2149/// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2150/// saturated to 0x00.
2151///
2152/// \headerfile <x86intrin.h>
2153///
2154/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2155///
2156/// \param __a
2157/// A 128-bit unsigned [16 x i8] vector.
2158/// \param __b
2159/// A 128-bit unsigned [16 x i8] vector.
2160/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2161/// of both parameters.
2162static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2163 __m128i __b) {
2164 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2165}
2166
2167/// Adds, with saturation, the corresponding elements of two 128-bit
2168/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2169/// of a 128-bit result vector of [8 x i16].
2170///
2171/// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2172/// are saturated to 0x0000.
2173///
2174/// \headerfile <x86intrin.h>
2175///
2176/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2177///
2178/// \param __a
2179/// A 128-bit unsigned [8 x i16] vector.
2180/// \param __b
2181/// A 128-bit unsigned [8 x i16] vector.
2182/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2183/// of both parameters.
2184static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2185 __m128i __b) {
2186 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2187}
2188
2189/// Computes the rounded averages of corresponding elements of two
2190/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2191/// corresponding element of a 128-bit result vector of [16 x i8].
2192///
2193/// \headerfile <x86intrin.h>
2194///
2195/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2196///
2197/// \param __a
2198/// A 128-bit unsigned [16 x i8] vector.
2199/// \param __b
2200/// A 128-bit unsigned [16 x i8] vector.
2201/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2202/// averages of both parameters.
2203static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2204 __m128i __b) {
2205 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2206}
2207
2208/// Computes the rounded averages of corresponding elements of two
2209/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2210/// corresponding element of a 128-bit result vector of [8 x i16].
2211///
2212/// \headerfile <x86intrin.h>
2213///
2214/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2215///
2216/// \param __a
2217/// A 128-bit unsigned [8 x i16] vector.
2218/// \param __b
2219/// A 128-bit unsigned [8 x i16] vector.
2220/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2221/// averages of both parameters.
2222static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2223 __m128i __b) {
2224 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2225}
2226
2227/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2228/// vectors, producing eight intermediate 32-bit signed integer products, and
2229/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2230/// [4 x i32] vector.
2231///
2232/// For example, bits [15:0] of both parameters are multiplied producing a
2233/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2234/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2235/// of the result.
2236///
2237/// \headerfile <x86intrin.h>
2238///
2239/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2240///
2241/// \param __a
2242/// A 128-bit signed [8 x i16] vector.
2243/// \param __b
2244/// A 128-bit signed [8 x i16] vector.
2245/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2246/// of both parameters.
2247static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2248 __m128i __b) {
2249 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2250}
2251
2252/// Compares corresponding elements of two 128-bit signed [8 x i16]
2253/// vectors, saving the greater value from each comparison in the
2254/// corresponding element of a 128-bit result vector of [8 x i16].
2255///
2256/// \headerfile <x86intrin.h>
2257///
2258/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2259///
2260/// \param __a
2261/// A 128-bit signed [8 x i16] vector.
2262/// \param __b
2263/// A 128-bit signed [8 x i16] vector.
2264/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2265/// each comparison.
2266static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2267 __m128i __b) {
2268 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2269}
2270
2271/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2272/// vectors, saving the greater value from each comparison in the
2273/// corresponding element of a 128-bit result vector of [16 x i8].
2274///
2275/// \headerfile <x86intrin.h>
2276///
2277/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2278///
2279/// \param __a
2280/// A 128-bit unsigned [16 x i8] vector.
2281/// \param __b
2282/// A 128-bit unsigned [16 x i8] vector.
2283/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2284/// each comparison.
2285static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2286 __m128i __b) {
2287 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2288}
2289
2290/// Compares corresponding elements of two 128-bit signed [8 x i16]
2291/// vectors, saving the smaller value from each comparison in the
2292/// corresponding element of a 128-bit result vector of [8 x i16].
2293///
2294/// \headerfile <x86intrin.h>
2295///
2296/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2297///
2298/// \param __a
2299/// A 128-bit signed [8 x i16] vector.
2300/// \param __b
2301/// A 128-bit signed [8 x i16] vector.
2302/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2303/// each comparison.
2304static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2305 __m128i __b) {
2306 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2307}
2308
2309/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2310/// vectors, saving the smaller value from each comparison in the
2311/// corresponding element of a 128-bit result vector of [16 x i8].
2312///
2313/// \headerfile <x86intrin.h>
2314///
2315/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2316///
2317/// \param __a
2318/// A 128-bit unsigned [16 x i8] vector.
2319/// \param __b
2320/// A 128-bit unsigned [16 x i8] vector.
2321/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2322/// each comparison.
2323static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2324 __m128i __b) {
2325 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2326}
2327
2328/// Multiplies the corresponding elements of two signed [8 x i16]
2329/// vectors, saving the upper 16 bits of each 32-bit product in the
2330/// corresponding element of a 128-bit signed [8 x i16] result vector.
2331///
2332/// \headerfile <x86intrin.h>
2333///
2334/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2335///
2336/// \param __a
2337/// A 128-bit signed [8 x i16] vector.
2338/// \param __b
2339/// A 128-bit signed [8 x i16] vector.
2340/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2341/// each of the eight 32-bit products.
2342static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2343 __m128i __b) {
2344 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2345}
2346
2347/// Multiplies the corresponding elements of two unsigned [8 x i16]
2348/// vectors, saving the upper 16 bits of each 32-bit product in the
2349/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2350///
2351/// \headerfile <x86intrin.h>
2352///
2353/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2354///
2355/// \param __a
2356/// A 128-bit unsigned [8 x i16] vector.
2357/// \param __b
2358/// A 128-bit unsigned [8 x i16] vector.
2359/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2360/// of each of the eight 32-bit products.
2361static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2362 __m128i __b) {
2363 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2364}
2365
2366/// Multiplies the corresponding elements of two signed [8 x i16]
2367/// vectors, saving the lower 16 bits of each 32-bit product in the
2368/// corresponding element of a 128-bit signed [8 x i16] result vector.
2369///
2370/// \headerfile <x86intrin.h>
2371///
2372/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2373///
2374/// \param __a
2375/// A 128-bit signed [8 x i16] vector.
2376/// \param __b
2377/// A 128-bit signed [8 x i16] vector.
2378/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2379/// each of the eight 32-bit products.
2380static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2381 __m128i __b) {
2382 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2383}
2384
2385/// Multiplies 32-bit unsigned integer values contained in the lower bits
2386/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2387/// product.
2388///
2389/// \headerfile <x86intrin.h>
2390///
2391/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2392///
2393/// \param __a
2394/// A 64-bit integer containing one of the source operands.
2395/// \param __b
2396/// A 64-bit integer containing one of the source operands.
2397/// \returns A 64-bit integer vector containing the product of both operands.
2398static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2399 __m64 __b) {
2400 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2401}
2402
2403/// Multiplies 32-bit unsigned integer values contained in the lower
2404/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2405/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2406///
2407/// \headerfile <x86intrin.h>
2408///
2409/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2410///
2411/// \param __a
2412/// A [2 x i64] vector containing one of the source operands.
2413/// \param __b
2414/// A [2 x i64] vector containing one of the source operands.
2415/// \returns A [2 x i64] vector containing the product of both operands.
2416static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2417 __m128i __b) {
2418 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2419}
2420
2421/// Computes the absolute differences of corresponding 8-bit integer
2422/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2423/// separately sums the second 8 absolute differences. Packs these two
2424/// unsigned 16-bit integer sums into the upper and lower elements of a
2425/// [2 x i64] vector.
2426///
2427/// \headerfile <x86intrin.h>
2428///
2429/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2430///
2431/// \param __a
2432/// A 128-bit integer vector containing one of the source operands.
2433/// \param __b
2434/// A 128-bit integer vector containing one of the source operands.
2435/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2436/// differences between both operands.
2437static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2438 __m128i __b) {
2439 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2440}
2441
2442/// Subtracts the corresponding 8-bit integer values in the operands.
2443///
2444/// \headerfile <x86intrin.h>
2445///
2446/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2447///
2448/// \param __a
2449/// A 128-bit integer vector containing the minuends.
2450/// \param __b
2451/// A 128-bit integer vector containing the subtrahends.
2452/// \returns A 128-bit integer vector containing the differences of the values
2453/// in the operands.
2454static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2455 __m128i __b) {
2456 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2457}
2458
2459/// Subtracts the corresponding 16-bit integer values in the operands.
2460///
2461/// \headerfile <x86intrin.h>
2462///
2463/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2464///
2465/// \param __a
2466/// A 128-bit integer vector containing the minuends.
2467/// \param __b
2468/// A 128-bit integer vector containing the subtrahends.
2469/// \returns A 128-bit integer vector containing the differences of the values
2470/// in the operands.
2471static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2472 __m128i __b) {
2473 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2474}
2475
2476/// Subtracts the corresponding 32-bit integer values in the operands.
2477///
2478/// \headerfile <x86intrin.h>
2479///
2480/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2481///
2482/// \param __a
2483/// A 128-bit integer vector containing the minuends.
2484/// \param __b
2485/// A 128-bit integer vector containing the subtrahends.
2486/// \returns A 128-bit integer vector containing the differences of the values
2487/// in the operands.
2488static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2489 __m128i __b) {
2490 return (__m128i)((__v4su)__a - (__v4su)__b);
2491}
2492
2493/// Subtracts signed or unsigned 64-bit integer values and writes the
2494/// difference to the corresponding bits in the destination.
2495///
2496/// \headerfile <x86intrin.h>
2497///
2498/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2499///
2500/// \param __a
2501/// A 64-bit integer vector containing the minuend.
2502/// \param __b
2503/// A 64-bit integer vector containing the subtrahend.
2504/// \returns A 64-bit integer vector containing the difference of the values in
2505/// the operands.
2506static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2507 __m64 __b) {
2508 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2509}
2510
2511/// Subtracts the corresponding elements of two [2 x i64] vectors.
2512///
2513/// \headerfile <x86intrin.h>
2514///
2515/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2516///
2517/// \param __a
2518/// A 128-bit integer vector containing the minuends.
2519/// \param __b
2520/// A 128-bit integer vector containing the subtrahends.
2521/// \returns A 128-bit integer vector containing the differences of the values
2522/// in the operands.
2523static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2524 __m128i __b) {
2525 return (__m128i)((__v2du)__a - (__v2du)__b);
2526}
2527
2528/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2529/// the input and returns the differences in the corresponding bytes in the
2530/// destination.
2531///
2532/// Differences greater than 0x7F are saturated to 0x7F, and differences
2533/// less than 0x80 are saturated to 0x80.
2534///
2535/// \headerfile <x86intrin.h>
2536///
2537/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2538///
2539/// \param __a
2540/// A 128-bit integer vector containing the minuends.
2541/// \param __b
2542/// A 128-bit integer vector containing the subtrahends.
2543/// \returns A 128-bit integer vector containing the differences of the values
2544/// in the operands.
2545static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2546 __m128i __b) {
2547 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2548}
2549
2550/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2551/// the input and returns the differences in the corresponding bytes in the
2552/// destination.
2553///
2554/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2555/// than 0x8000 are saturated to 0x8000.
2556///
2557/// \headerfile <x86intrin.h>
2558///
2559/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2560///
2561/// \param __a
2562/// A 128-bit integer vector containing the minuends.
2563/// \param __b
2564/// A 128-bit integer vector containing the subtrahends.
2565/// \returns A 128-bit integer vector containing the differences of the values
2566/// in the operands.
2567static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2568 __m128i __b) {
2569 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2570}
2571
2572/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2573/// the input and returns the differences in the corresponding bytes in the
2574/// destination.
2575///
2576/// Differences less than 0x00 are saturated to 0x00.
2577///
2578/// \headerfile <x86intrin.h>
2579///
2580/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2581///
2582/// \param __a
2583/// A 128-bit integer vector containing the minuends.
2584/// \param __b
2585/// A 128-bit integer vector containing the subtrahends.
2586/// \returns A 128-bit integer vector containing the unsigned integer
2587/// differences of the values in the operands.
2588static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2589 __m128i __b) {
2590 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2591}
2592
2593/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2594/// the input and returns the differences in the corresponding bytes in the
2595/// destination.
2596///
2597/// Differences less than 0x0000 are saturated to 0x0000.
2598///
2599/// \headerfile <x86intrin.h>
2600///
2601/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2602///
2603/// \param __a
2604/// A 128-bit integer vector containing the minuends.
2605/// \param __b
2606/// A 128-bit integer vector containing the subtrahends.
2607/// \returns A 128-bit integer vector containing the unsigned integer
2608/// differences of the values in the operands.
2609static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2610 __m128i __b) {
2611 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2612}
2613
2614/// Performs a bitwise AND of two 128-bit integer vectors.
2615///
2616/// \headerfile <x86intrin.h>
2617///
2618/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2619///
2620/// \param __a
2621/// A 128-bit integer vector containing one of the source operands.
2622/// \param __b
2623/// A 128-bit integer vector containing one of the source operands.
2624/// \returns A 128-bit integer vector containing the bitwise AND of the values
2625/// in both operands.
2626static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2627 __m128i __b) {
2628 return (__m128i)((__v2du)__a & (__v2du)__b);
2629}
2630
2631/// Performs a bitwise AND of two 128-bit integer vectors, using the
2632/// one's complement of the values contained in the first source operand.
2633///
2634/// \headerfile <x86intrin.h>
2635///
2636/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2637///
2638/// \param __a
2639/// A 128-bit vector containing the left source operand. The one's complement
2640/// of this value is used in the bitwise AND.
2641/// \param __b
2642/// A 128-bit vector containing the right source operand.
2643/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2644/// complement of the first operand and the values in the second operand.
2645static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2646 __m128i __b) {
2647 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2648}
2649/// Performs a bitwise OR of two 128-bit integer vectors.
2650///
2651/// \headerfile <x86intrin.h>
2652///
2653/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2654///
2655/// \param __a
2656/// A 128-bit integer vector containing one of the source operands.
2657/// \param __b
2658/// A 128-bit integer vector containing one of the source operands.
2659/// \returns A 128-bit integer vector containing the bitwise OR of the values
2660/// in both operands.
2661static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2662 __m128i __b) {
2663 return (__m128i)((__v2du)__a | (__v2du)__b);
2664}
2665
2666/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2667///
2668/// \headerfile <x86intrin.h>
2669///
2670/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2671///
2672/// \param __a
2673/// A 128-bit integer vector containing one of the source operands.
2674/// \param __b
2675/// A 128-bit integer vector containing one of the source operands.
2676/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2677/// values in both operands.
2678static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2679 __m128i __b) {
2680 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2681}
2682
/// Shifts the whole 128-bit integer vector operand to the left by the given
///    number of bytes. The vacated low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted to the left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2703
/// Alternate spelling of the 128-bit byte-wise left shift: this macro expands
///    to the same builtin call, with the same argument conversions, as
///    _mm_slli_si128.
///
/// \param a
///    The source operand; it is converted to a 128-bit integer vector.
/// \param imm
///    The shift amount; it is converted to \c int before being passed on.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2707
2708/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2709/// by the specified number of bits. Low-order bits are cleared.
2710///
2711/// \headerfile <x86intrin.h>
2712///
2713/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2714///
2715/// \param __a
2716/// A 128-bit integer vector containing the source operand.
2717/// \param __count
2718/// An integer value specifying the number of bits to left-shift each value
2719/// in operand \a __a.
2720/// \returns A 128-bit integer vector containing the left-shifted values.
2721static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2722 int __count) {
2723 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2724}
2725
2726/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2727/// by the specified number of bits. Low-order bits are cleared.
2728///
2729/// \headerfile <x86intrin.h>
2730///
2731/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2732///
2733/// \param __a
2734/// A 128-bit integer vector containing the source operand.
2735/// \param __count
2736/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2737/// to left-shift each value in operand \a __a.
2738/// \returns A 128-bit integer vector containing the left-shifted values.
2739static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2740 __m128i __count) {
2741 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2742}
2743
2744/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2745/// by the specified number of bits. Low-order bits are cleared.
2746///
2747/// \headerfile <x86intrin.h>
2748///
2749/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2750///
2751/// \param __a
2752/// A 128-bit integer vector containing the source operand.
2753/// \param __count
2754/// An integer value specifying the number of bits to left-shift each value
2755/// in operand \a __a.
2756/// \returns A 128-bit integer vector containing the left-shifted values.
2757static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2758 int __count) {
2759 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2760}
2761
2762/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2763/// by the specified number of bits. Low-order bits are cleared.
2764///
2765/// \headerfile <x86intrin.h>
2766///
2767/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2768///
2769/// \param __a
2770/// A 128-bit integer vector containing the source operand.
2771/// \param __count
2772/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2773/// to left-shift each value in operand \a __a.
2774/// \returns A 128-bit integer vector containing the left-shifted values.
2775static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2776 __m128i __count) {
2777 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2778}
2779
2780/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2781/// by the specified number of bits. Low-order bits are cleared.
2782///
2783/// \headerfile <x86intrin.h>
2784///
2785/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2786///
2787/// \param __a
2788/// A 128-bit integer vector containing the source operand.
2789/// \param __count
2790/// An integer value specifying the number of bits to left-shift each value
2791/// in operand \a __a.
2792/// \returns A 128-bit integer vector containing the left-shifted values.
2793static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2794 int __count) {
2795 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2796}
2797
2798/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2799/// by the specified number of bits. Low-order bits are cleared.
2800///
2801/// \headerfile <x86intrin.h>
2802///
2803/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2804///
2805/// \param __a
2806/// A 128-bit integer vector containing the source operand.
2807/// \param __count
2808/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2809/// to left-shift each value in operand \a __a.
2810/// \returns A 128-bit integer vector containing the left-shifted values.
2811static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2812 __m128i __count) {
2813 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2814}
2815
2816/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2817/// by the specified number of bits. High-order bits are filled with the sign
2818/// bit of the initial value.
2819///
2820/// \headerfile <x86intrin.h>
2821///
2822/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2823///
2824/// \param __a
2825/// A 128-bit integer vector containing the source operand.
2826/// \param __count
2827/// An integer value specifying the number of bits to right-shift each value
2828/// in operand \a __a.
2829/// \returns A 128-bit integer vector containing the right-shifted values.
2830static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2831 int __count) {
2832 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2833}
2834
2835/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2836/// by the specified number of bits. High-order bits are filled with the sign
2837/// bit of the initial value.
2838///
2839/// \headerfile <x86intrin.h>
2840///
2841/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2842///
2843/// \param __a
2844/// A 128-bit integer vector containing the source operand.
2845/// \param __count
2846/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2847/// to right-shift each value in operand \a __a.
2848/// \returns A 128-bit integer vector containing the right-shifted values.
2849static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2850 __m128i __count) {
2851 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2852}
2853
2854/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2855/// by the specified number of bits. High-order bits are filled with the sign
2856/// bit of the initial value.
2857///
2858/// \headerfile <x86intrin.h>
2859///
2860/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2861///
2862/// \param __a
2863/// A 128-bit integer vector containing the source operand.
2864/// \param __count
2865/// An integer value specifying the number of bits to right-shift each value
2866/// in operand \a __a.
2867/// \returns A 128-bit integer vector containing the right-shifted values.
2868static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2869 int __count) {
2870 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2871}
2872
2873/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2874/// by the specified number of bits. High-order bits are filled with the sign
2875/// bit of the initial value.
2876///
2877/// \headerfile <x86intrin.h>
2878///
2879/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2880///
2881/// \param __a
2882/// A 128-bit integer vector containing the source operand.
2883/// \param __count
2884/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2885/// to right-shift each value in operand \a __a.
2886/// \returns A 128-bit integer vector containing the right-shifted values.
2887static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2888 __m128i __count) {
2889 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2890}
2891
/// Shifts the whole 128-bit integer vector operand to the right by the given
///    number of bytes. The vacated high-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted to the right.
/// \returns A 128-bit integer vector holding the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2912
/// Alternate spelling of the 128-bit byte-wise right shift: this macro
///    expands to the same builtin call, with the same argument conversions,
///    as _mm_srli_si128.
///
/// \param a
///    The source operand; it is converted to a 128-bit integer vector.
/// \param imm
///    The shift amount; it is converted to \c int before being passed on.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2916
2917/// Right-shifts each of 16-bit values in the 128-bit integer vector
2918/// operand by the specified number of bits. High-order bits are cleared.
2919///
2920/// \headerfile <x86intrin.h>
2921///
2922/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2923///
2924/// \param __a
2925/// A 128-bit integer vector containing the source operand.
2926/// \param __count
2927/// An integer value specifying the number of bits to right-shift each value
2928/// in operand \a __a.
2929/// \returns A 128-bit integer vector containing the right-shifted values.
2930static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2931 int __count) {
2932 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2933}
2934
2935/// Right-shifts each of 16-bit values in the 128-bit integer vector
2936/// operand by the specified number of bits. High-order bits are cleared.
2937///
2938/// \headerfile <x86intrin.h>
2939///
2940/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2941///
2942/// \param __a
2943/// A 128-bit integer vector containing the source operand.
2944/// \param __count
2945/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2946/// to right-shift each value in operand \a __a.
2947/// \returns A 128-bit integer vector containing the right-shifted values.
2948static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2949 __m128i __count) {
2950 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2951}
2952
2953/// Right-shifts each of 32-bit values in the 128-bit integer vector
2954/// operand by the specified number of bits. High-order bits are cleared.
2955///
2956/// \headerfile <x86intrin.h>
2957///
2958/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2959///
2960/// \param __a
2961/// A 128-bit integer vector containing the source operand.
2962/// \param __count
2963/// An integer value specifying the number of bits to right-shift each value
2964/// in operand \a __a.
2965/// \returns A 128-bit integer vector containing the right-shifted values.
2966static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2967 int __count) {
2968 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2969}
2970
2971/// Right-shifts each of 32-bit values in the 128-bit integer vector
2972/// operand by the specified number of bits. High-order bits are cleared.
2973///
2974/// \headerfile <x86intrin.h>
2975///
2976/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2977///
2978/// \param __a
2979/// A 128-bit integer vector containing the source operand.
2980/// \param __count
2981/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2982/// to right-shift each value in operand \a __a.
2983/// \returns A 128-bit integer vector containing the right-shifted values.
2984static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2985 __m128i __count) {
2986 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2987}
2988
2989/// Right-shifts each of 64-bit values in the 128-bit integer vector
2990/// operand by the specified number of bits. High-order bits are cleared.
2991///
2992/// \headerfile <x86intrin.h>
2993///
2994/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2995///
2996/// \param __a
2997/// A 128-bit integer vector containing the source operand.
2998/// \param __count
2999/// An integer value specifying the number of bits to right-shift each value
3000/// in operand \a __a.
3001/// \returns A 128-bit integer vector containing the right-shifted values.
3002static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3003 int __count) {
3004 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3005}
3006
3007/// Right-shifts each of 64-bit values in the 128-bit integer vector
3008/// operand by the specified number of bits. High-order bits are cleared.
3009///
3010/// \headerfile <x86intrin.h>
3011///
3012/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3013///
3014/// \param __a
3015/// A 128-bit integer vector containing the source operand.
3016/// \param __count
3017/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3018/// to right-shift each value in operand \a __a.
3019/// \returns A 128-bit integer vector containing the right-shifted values.
3020static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3021 __m128i __count) {
3022 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3023}
3024
3025/// Compares each of the corresponding 8-bit values of the 128-bit
3026/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3027/// for true.
3028///
3029/// \headerfile <x86intrin.h>
3030///
3031/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3032///
3033/// \param __a
3034/// A 128-bit integer vector.
3035/// \param __b
3036/// A 128-bit integer vector.
3037/// \returns A 128-bit integer vector containing the comparison results.
3038static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3039 __m128i __b) {
3040 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3041}
3042
3043/// Compares each of the corresponding 16-bit values of the 128-bit
3044/// integer vectors for equality. Each comparison yields 0x0 for false,
3045/// 0xFFFF for true.
3046///
3047/// \headerfile <x86intrin.h>
3048///
3049/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3050///
3051/// \param __a
3052/// A 128-bit integer vector.
3053/// \param __b
3054/// A 128-bit integer vector.
3055/// \returns A 128-bit integer vector containing the comparison results.
3056static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3057 __m128i __b) {
3058 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3059}
3060
3061/// Compares each of the corresponding 32-bit values of the 128-bit
3062/// integer vectors for equality. Each comparison yields 0x0 for false,
3063/// 0xFFFFFFFF for true.
3064///
3065/// \headerfile <x86intrin.h>
3066///
3067/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3068///
3069/// \param __a
3070/// A 128-bit integer vector.
3071/// \param __b
3072/// A 128-bit integer vector.
3073/// \returns A 128-bit integer vector containing the comparison results.
3074static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3075 __m128i __b) {
3076 return (__m128i)((__v4si)__a == (__v4si)__b);
3077}
3078
3079/// Compares each of the corresponding signed 8-bit values of the 128-bit
3080/// integer vectors to determine if the values in the first operand are
3081/// greater than those in the second operand. Each comparison yields 0x0 for
3082/// false, 0xFF for true.
3083///
3084/// \headerfile <x86intrin.h>
3085///
3086/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3087///
3088/// \param __a
3089/// A 128-bit integer vector.
3090/// \param __b
3091/// A 128-bit integer vector.
3092/// \returns A 128-bit integer vector containing the comparison results.
3093static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3094 __m128i __b) {
3095 /* This function always performs a signed comparison, but __v16qi is a char
3096 which may be signed or unsigned, so use __v16qs. */
3097 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3098}
3099
3100/// Compares each of the corresponding signed 16-bit values of the
3101/// 128-bit integer vectors to determine if the values in the first operand
3102/// are greater than those in the second operand.
3103///
3104/// Each comparison yields 0x0 for false, 0xFFFF for true.
3105///
3106/// \headerfile <x86intrin.h>
3107///
3108/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3109///
3110/// \param __a
3111/// A 128-bit integer vector.
3112/// \param __b
3113/// A 128-bit integer vector.
3114/// \returns A 128-bit integer vector containing the comparison results.
3115static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3116 __m128i __b) {
3117 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3118}
3119
3120/// Compares each of the corresponding signed 32-bit values of the
3121/// 128-bit integer vectors to determine if the values in the first operand
3122/// are greater than those in the second operand.
3123///
3124/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3125///
3126/// \headerfile <x86intrin.h>
3127///
3128/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3129///
3130/// \param __a
3131/// A 128-bit integer vector.
3132/// \param __b
3133/// A 128-bit integer vector.
3134/// \returns A 128-bit integer vector containing the comparison results.
3135static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3136 __m128i __b) {
3137 return (__m128i)((__v4si)__a > (__v4si)__b);
3138}
3139
3140/// Compares each of the corresponding signed 8-bit values of the 128-bit
3141/// integer vectors to determine if the values in the first operand are less
3142/// than those in the second operand.
3143///
3144/// Each comparison yields 0x0 for false, 0xFF for true.
3145///
3146/// \headerfile <x86intrin.h>
3147///
3148/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3149///
3150/// \param __a
3151/// A 128-bit integer vector.
3152/// \param __b
3153/// A 128-bit integer vector.
3154/// \returns A 128-bit integer vector containing the comparison results.
3155static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3156 __m128i __b) {
3157 return _mm_cmpgt_epi8(__b, __a);
3158}
3159
3160/// Compares each of the corresponding signed 16-bit values of the
3161/// 128-bit integer vectors to determine if the values in the first operand
3162/// are less than those in the second operand.
3163///
3164/// Each comparison yields 0x0 for false, 0xFFFF for true.
3165///
3166/// \headerfile <x86intrin.h>
3167///
3168/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3169///
3170/// \param __a
3171/// A 128-bit integer vector.
3172/// \param __b
3173/// A 128-bit integer vector.
3174/// \returns A 128-bit integer vector containing the comparison results.
3175static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3176 __m128i __b) {
3177 return _mm_cmpgt_epi16(__b, __a);
3178}
3179
3180/// Compares each of the corresponding signed 32-bit values of the
3181/// 128-bit integer vectors to determine if the values in the first operand
3182/// are less than those in the second operand.
3183///
3184/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3185///
3186/// \headerfile <x86intrin.h>
3187///
3188/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3189///
3190/// \param __a
3191/// A 128-bit integer vector.
3192/// \param __b
3193/// A 128-bit integer vector.
3194/// \returns A 128-bit integer vector containing the comparison results.
3195static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3196 __m128i __b) {
3197 return _mm_cmpgt_epi32(__b, __a);
3198}
3199
3200#ifdef __x86_64__
3201/// Converts a 64-bit signed integer value from the second operand into a
3202/// double-precision value and returns it in the lower element of a [2 x
3203/// double] vector; the upper element of the returned vector is copied from
3204/// the upper element of the first operand.
3205///
3206/// \headerfile <x86intrin.h>
3207///
3208/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3209///
3210/// \param __a
3211/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3212/// copied to the upper 64 bits of the destination.
3213/// \param __b
3214/// A 64-bit signed integer operand containing the value to be converted.
3215/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3216/// converted value of the second operand. The upper 64 bits are copied from
3217/// the upper 64 bits of the first operand.
3218static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3219 long long __b) {
3220 __a[0] = __b;
3221 return __a;
3222}
3223
3224/// Converts the first (lower) element of a vector of [2 x double] into a
3225/// 64-bit signed integer value, according to the current rounding mode.
3226///
3227/// \headerfile <x86intrin.h>
3228///
3229/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3230///
3231/// \param __a
3232/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3233/// conversion.
3234/// \returns A 64-bit signed integer containing the converted value.
3235static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3236 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3237}
3238
3239/// Converts the first (lower) element of a vector of [2 x double] into a
3240/// 64-bit signed integer value, truncating the result when it is inexact.
3241///
3242/// \headerfile <x86intrin.h>
3243///
3244/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3245/// instruction.
3246///
3247/// \param __a
3248/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3249/// conversion.
3250/// \returns A 64-bit signed integer containing the converted value.
3251static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3252 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3253}
3254#endif
3255
3256/// Converts a vector of [4 x i32] into a vector of [4 x float].
3257///
3258/// \headerfile <x86intrin.h>
3259///
3260/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3261///
3262/// \param __a
3263/// A 128-bit integer vector.
3264/// \returns A 128-bit vector of [4 x float] containing the converted values.
3265static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3266 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3267}
3268
3269/// Converts a vector of [4 x float] into a vector of [4 x i32].
3270///
3271/// \headerfile <x86intrin.h>
3272///
3273/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3274///
3275/// \param __a
3276/// A 128-bit vector of [4 x float].
3277/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3278/// values.
3279static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3280 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3281}
3282
3283/// Converts a vector of [4 x float] into a vector of [4 x i32],
3284/// truncating the result when it is inexact.
3285///
3286/// \headerfile <x86intrin.h>
3287///
3288/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3289/// instruction.
3290///
3291/// \param __a
3292/// A 128-bit vector of [4 x float].
3293/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3294static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3295 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3296}
3297
3298/// Returns a vector of [4 x i32] where the lowest element is the input
3299/// operand and the remaining elements are zero.
3300///
3301/// \headerfile <x86intrin.h>
3302///
3303/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3304///
3305/// \param __a
3306/// A 32-bit signed integer operand.
3307/// \returns A 128-bit vector of [4 x i32].
3308static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3309 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3310}
3311
3312/// Returns a vector of [2 x i64] where the lower element is the input
3313/// operand and the upper element is zero.
3314///
3315/// \headerfile <x86intrin.h>
3316///
3317/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3318/// in 64-bit mode.
3319///
3320/// \param __a
3321/// A 64-bit signed integer operand containing the value to be converted.
3322/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3323static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3324 return __extension__(__m128i)(__v2di){__a, 0};
3325}
3326
3327/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3328/// 32-bit signed integer value.
3329///
3330/// \headerfile <x86intrin.h>
3331///
3332/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3333///
3334/// \param __a
3335/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3336/// destination.
3337/// \returns A 32-bit signed integer containing the moved value.
3338static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3339 __v4si __b = (__v4si)__a;
3340 return __b[0];
3341}
3342
3343/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3344/// 64-bit signed integer value.
3345///
3346/// \headerfile <x86intrin.h>
3347///
3348/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3349///
3350/// \param __a
3351/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3352/// destination.
3353/// \returns A 64-bit signed integer containing the moved value.
3354static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3355 return __a[0];
3356}
3357
3358/// Moves packed integer values from an aligned 128-bit memory location
3359/// to elements in a 128-bit integer vector.
3360///
3361/// \headerfile <x86intrin.h>
3362///
3363/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3364///
3365/// \param __p
3366/// An aligned pointer to a memory location containing integer values.
3367/// \returns A 128-bit integer vector containing the moved values.
3368static __inline__ __m128i __DEFAULT_FN_ATTRS
3369_mm_load_si128(__m128i const *__p) {
3370 return *__p;
3371}
3372
3373/// Moves packed integer values from an unaligned 128-bit memory location
3374/// to elements in a 128-bit integer vector.
3375///
3376/// \headerfile <x86intrin.h>
3377///
3378/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3379///
3380/// \param __p
3381/// A pointer to a memory location containing integer values.
3382/// \returns A 128-bit integer vector containing the moved values.
3383static __inline__ __m128i __DEFAULT_FN_ATTRS
3384_mm_loadu_si128(__m128i_u const *__p) {
3385 struct __loadu_si128 {
3386 __m128i_u __v;
3387 } __attribute__((__packed__, __may_alias__));
3388 return ((const struct __loadu_si128 *)__p)->__v;
3389}
3390
3391/// Returns a vector of [2 x i64] where the lower element is taken from
3392/// the lower element of the operand, and the upper element is zero.
3393///
3394/// \headerfile <x86intrin.h>
3395///
3396/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3397///
3398/// \param __p
3399/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3400/// the destination.
3401/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3402/// moved value. The higher order bits are cleared.
3403static __inline__ __m128i __DEFAULT_FN_ATTRS
3404_mm_loadl_epi64(__m128i_u const *__p) {
3405 struct __mm_loadl_epi64_struct {
3406 long long __u;
3407 } __attribute__((__packed__, __may_alias__));
3408 return __extension__(__m128i){
3409 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3410}
3411
3412/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3413/// This could be used as an argument to another intrinsic function where the
3414/// argument is required but the value is not actually used.
3415///
3416/// \headerfile <x86intrin.h>
3417///
3418/// This intrinsic has no corresponding instruction.
3419///
3420/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3421static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3422 return (__m128i)__builtin_ia32_undef128();
3423}
3424
3425/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3426/// the specified 64-bit integer values.
3427///
3428/// \headerfile <x86intrin.h>
3429///
3430/// This intrinsic is a utility function and does not correspond to a specific
3431/// instruction.
3432///
3433/// \param __q1
3434/// A 64-bit integer value used to initialize the upper 64 bits of the
3435/// destination vector of [2 x i64].
3436/// \param __q0
3437/// A 64-bit integer value used to initialize the lower 64 bits of the
3438/// destination vector of [2 x i64].
3439/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3440/// provided in the operands.
3441static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3442 long long __q0) {
3443 return __extension__(__m128i)(__v2di){__q0, __q1};
3444}
3445
3446/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3447/// the specified 64-bit integer values.
3448///
3449/// \headerfile <x86intrin.h>
3450///
3451/// This intrinsic is a utility function and does not correspond to a specific
3452/// instruction.
3453///
3454/// \param __q1
3455/// A 64-bit integer value used to initialize the upper 64 bits of the
3456/// destination vector of [2 x i64].
3457/// \param __q0
3458/// A 64-bit integer value used to initialize the lower 64 bits of the
3459/// destination vector of [2 x i64].
3460/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3461/// provided in the operands.
3462static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3463 __m64 __q0) {
3464 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3465}
3466
3467/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3468/// the specified 32-bit integer values.
3469///
3470/// \headerfile <x86intrin.h>
3471///
3472/// This intrinsic is a utility function and does not correspond to a specific
3473/// instruction.
3474///
3475/// \param __i3
3476/// A 32-bit integer value used to initialize bits [127:96] of the
3477/// destination vector.
3478/// \param __i2
3479/// A 32-bit integer value used to initialize bits [95:64] of the destination
3480/// vector.
3481/// \param __i1
3482/// A 32-bit integer value used to initialize bits [63:32] of the destination
3483/// vector.
3484/// \param __i0
3485/// A 32-bit integer value used to initialize bits [31:0] of the destination
3486/// vector.
3487/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3488/// provided in the operands.
3489static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3490 int __i1, int __i0) {
3491 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3492}
3493
3494/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3495/// the specified 16-bit integer values.
3496///
3497/// \headerfile <x86intrin.h>
3498///
3499/// This intrinsic is a utility function and does not correspond to a specific
3500/// instruction.
3501///
3502/// \param __w7
3503/// A 16-bit integer value used to initialize bits [127:112] of the
3504/// destination vector.
3505/// \param __w6
3506/// A 16-bit integer value used to initialize bits [111:96] of the
3507/// destination vector.
3508/// \param __w5
3509/// A 16-bit integer value used to initialize bits [95:80] of the destination
3510/// vector.
3511/// \param __w4
3512/// A 16-bit integer value used to initialize bits [79:64] of the destination
3513/// vector.
3514/// \param __w3
3515/// A 16-bit integer value used to initialize bits [63:48] of the destination
3516/// vector.
3517/// \param __w2
3518/// A 16-bit integer value used to initialize bits [47:32] of the destination
3519/// vector.
3520/// \param __w1
3521/// A 16-bit integer value used to initialize bits [31:16] of the destination
3522/// vector.
3523/// \param __w0
3524/// A 16-bit integer value used to initialize bits [15:0] of the destination
3525/// vector.
3526/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3527/// provided in the operands.
3528static __inline__ __m128i __DEFAULT_FN_ATTRS
3529_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3530 short __w2, short __w1, short __w0) {
3531 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3532 __w4, __w5, __w6, __w7};
3533}
3534
3535/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3536/// the specified 8-bit integer values.
3537///
3538/// \headerfile <x86intrin.h>
3539///
3540/// This intrinsic is a utility function and does not correspond to a specific
3541/// instruction.
3542///
3543/// \param __b15
3544/// Initializes bits [127:120] of the destination vector.
3545/// \param __b14
3546/// Initializes bits [119:112] of the destination vector.
3547/// \param __b13
3548/// Initializes bits [111:104] of the destination vector.
3549/// \param __b12
3550/// Initializes bits [103:96] of the destination vector.
3551/// \param __b11
3552/// Initializes bits [95:88] of the destination vector.
3553/// \param __b10
3554/// Initializes bits [87:80] of the destination vector.
3555/// \param __b9
3556/// Initializes bits [79:72] of the destination vector.
3557/// \param __b8
3558/// Initializes bits [71:64] of the destination vector.
3559/// \param __b7
3560/// Initializes bits [63:56] of the destination vector.
3561/// \param __b6
3562/// Initializes bits [55:48] of the destination vector.
3563/// \param __b5
3564/// Initializes bits [47:40] of the destination vector.
3565/// \param __b4
3566/// Initializes bits [39:32] of the destination vector.
3567/// \param __b3
3568/// Initializes bits [31:24] of the destination vector.
3569/// \param __b2
3570/// Initializes bits [23:16] of the destination vector.
3571/// \param __b1
3572/// Initializes bits [15:8] of the destination vector.
3573/// \param __b0
3574/// Initializes bits [7:0] of the destination vector.
3575/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3576/// provided in the operands.
3577static __inline__ __m128i __DEFAULT_FN_ATTRS
3578_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3579 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3580 char __b4, char __b3, char __b2, char __b1, char __b0) {
3581 return __extension__(__m128i)(__v16qi){
3582 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3583 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3584}
3585
3586/// Initializes both values in a 128-bit integer vector with the
3587/// specified 64-bit integer value.
3588///
3589/// \headerfile <x86intrin.h>
3590///
3591/// This intrinsic is a utility function and does not correspond to a specific
3592/// instruction.
3593///
3594/// \param __q
3595/// Integer value used to initialize the elements of the destination integer
3596/// vector.
3597/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3598/// elements containing the value provided in the operand.
3599static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3600 return _mm_set_epi64x(__q, __q);
3601}
3602
3603/// Initializes both values in a 128-bit vector of [2 x i64] with the
3604/// specified 64-bit value.
3605///
3606/// \headerfile <x86intrin.h>
3607///
3608/// This intrinsic is a utility function and does not correspond to a specific
3609/// instruction.
3610///
3611/// \param __q
3612/// A 64-bit value used to initialize the elements of the destination integer
3613/// vector.
3614/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3615/// containing the value provided in the operand.
3616static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3617 return _mm_set_epi64(__q, __q);
3618}
3619
3620/// Initializes all values in a 128-bit vector of [4 x i32] with the
3621/// specified 32-bit value.
3622///
3623/// \headerfile <x86intrin.h>
3624///
3625/// This intrinsic is a utility function and does not correspond to a specific
3626/// instruction.
3627///
3628/// \param __i
3629/// A 32-bit value used to initialize the elements of the destination integer
3630/// vector.
3631/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3632/// containing the value provided in the operand.
3633static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3634 return _mm_set_epi32(__i, __i, __i, __i);
3635}
3636
3637/// Initializes all values in a 128-bit vector of [8 x i16] with the
3638/// specified 16-bit value.
3639///
3640/// \headerfile <x86intrin.h>
3641///
3642/// This intrinsic is a utility function and does not correspond to a specific
3643/// instruction.
3644///
3645/// \param __w
3646/// A 16-bit value used to initialize the elements of the destination integer
3647/// vector.
3648/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3649/// containing the value provided in the operand.
3650static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3651 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3652}
3653
3654/// Initializes all values in a 128-bit vector of [16 x i8] with the
3655/// specified 8-bit value.
3656///
3657/// \headerfile <x86intrin.h>
3658///
3659/// This intrinsic is a utility function and does not correspond to a specific
3660/// instruction.
3661///
3662/// \param __b
3663/// An 8-bit value used to initialize the elements of the destination integer
3664/// vector.
3665/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3666/// containing the value provided in the operand.
3667static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3668 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3669 __b, __b, __b, __b, __b);
3670}
3671
3672/// Constructs a 128-bit integer vector, initialized in reverse order
3673/// with the specified 64-bit integral values.
3674///
3675/// \headerfile <x86intrin.h>
3676///
3677/// This intrinsic does not correspond to a specific instruction.
3678///
3679/// \param __q0
3680/// A 64-bit integral value used to initialize the lower 64 bits of the
3681/// result.
3682/// \param __q1
3683/// A 64-bit integral value used to initialize the upper 64 bits of the
3684/// result.
3685/// \returns An initialized 128-bit integer vector.
3686static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3687 __m64 __q1) {
3688 return _mm_set_epi64(__q1, __q0);
3689}
3690
3691/// Constructs a 128-bit integer vector, initialized in reverse order
3692/// with the specified 32-bit integral values.
3693///
3694/// \headerfile <x86intrin.h>
3695///
3696/// This intrinsic is a utility function and does not correspond to a specific
3697/// instruction.
3698///
3699/// \param __i0
3700/// A 32-bit integral value used to initialize bits [31:0] of the result.
3701/// \param __i1
3702/// A 32-bit integral value used to initialize bits [63:32] of the result.
3703/// \param __i2
3704/// A 32-bit integral value used to initialize bits [95:64] of the result.
3705/// \param __i3
3706/// A 32-bit integral value used to initialize bits [127:96] of the result.
3707/// \returns An initialized 128-bit integer vector.
3708static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3709 int __i2,
3710 int __i3) {
3711 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3712}
3713
3714/// Constructs a 128-bit integer vector, initialized in reverse order
3715/// with the specified 16-bit integral values.
3716///
3717/// \headerfile <x86intrin.h>
3718///
3719/// This intrinsic is a utility function and does not correspond to a specific
3720/// instruction.
3721///
3722/// \param __w0
3723/// A 16-bit integral value used to initialize bits [15:0] of the result.
3724/// \param __w1
3725/// A 16-bit integral value used to initialize bits [31:16] of the result.
3726/// \param __w2
3727/// A 16-bit integral value used to initialize bits [47:32] of the result.
3728/// \param __w3
3729/// A 16-bit integral value used to initialize bits [63:48] of the result.
3730/// \param __w4
3731/// A 16-bit integral value used to initialize bits [79:64] of the result.
3732/// \param __w5
3733/// A 16-bit integral value used to initialize bits [95:80] of the result.
3734/// \param __w6
3735/// A 16-bit integral value used to initialize bits [111:96] of the result.
3736/// \param __w7
3737/// A 16-bit integral value used to initialize bits [127:112] of the result.
3738/// \returns An initialized 128-bit integer vector.
3739static __inline__ __m128i __DEFAULT_FN_ATTRS
3740_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3741 short __w5, short __w6, short __w7) {
3742 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3743}
3744
3745/// Constructs a 128-bit integer vector, initialized in reverse order
3746/// with the specified 8-bit integral values.
3747///
3748/// \headerfile <x86intrin.h>
3749///
3750/// This intrinsic is a utility function and does not correspond to a specific
3751/// instruction.
3752///
3753/// \param __b0
3754/// An 8-bit integral value used to initialize bits [7:0] of the result.
3755/// \param __b1
3756/// An 8-bit integral value used to initialize bits [15:8] of the result.
3757/// \param __b2
3758/// An 8-bit integral value used to initialize bits [23:16] of the result.
3759/// \param __b3
3760/// An 8-bit integral value used to initialize bits [31:24] of the result.
3761/// \param __b4
3762/// An 8-bit integral value used to initialize bits [39:32] of the result.
3763/// \param __b5
3764/// An 8-bit integral value used to initialize bits [47:40] of the result.
3765/// \param __b6
3766/// An 8-bit integral value used to initialize bits [55:48] of the result.
3767/// \param __b7
3768/// An 8-bit integral value used to initialize bits [63:56] of the result.
3769/// \param __b8
3770/// An 8-bit integral value used to initialize bits [71:64] of the result.
3771/// \param __b9
3772/// An 8-bit integral value used to initialize bits [79:72] of the result.
3773/// \param __b10
3774/// An 8-bit integral value used to initialize bits [87:80] of the result.
3775/// \param __b11
3776/// An 8-bit integral value used to initialize bits [95:88] of the result.
3777/// \param __b12
3778/// An 8-bit integral value used to initialize bits [103:96] of the result.
3779/// \param __b13
3780/// An 8-bit integral value used to initialize bits [111:104] of the result.
3781/// \param __b14
3782/// An 8-bit integral value used to initialize bits [119:112] of the result.
3783/// \param __b15
3784/// An 8-bit integral value used to initialize bits [127:120] of the result.
3785/// \returns An initialized 128-bit integer vector.
3786static __inline__ __m128i __DEFAULT_FN_ATTRS
3787_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3788 char __b6, char __b7, char __b8, char __b9, char __b10,
3789 char __b11, char __b12, char __b13, char __b14, char __b15) {
3790 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3791 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3792}
3793
3794/// Creates a 128-bit integer vector initialized to zero.
3795///
3796/// \headerfile <x86intrin.h>
3797///
3798/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3799///
3800/// \returns An initialized 128-bit integer vector with all elements set to
3801/// zero.
3802static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3803 return __extension__(__m128i)(__v2di){0LL, 0LL};
3804}
3805
3806/// Stores a 128-bit integer vector to a memory location aligned on a
3807/// 128-bit boundary.
3808///
3809/// \headerfile <x86intrin.h>
3810///
3811/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3812///
3813/// \param __p
3814/// A pointer to an aligned memory location that will receive the integer
3815/// values.
3816/// \param __b
3817/// A 128-bit integer vector containing the values to be moved.
3818static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3819 __m128i __b) {
3820 *__p = __b;
3821}
3822
3823/// Stores a 128-bit integer vector to an unaligned memory location.
3824///
3825/// \headerfile <x86intrin.h>
3826///
3827/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3828///
3829/// \param __p
3830/// A pointer to a memory location that will receive the integer values.
3831/// \param __b
3832/// A 128-bit integer vector containing the values to be moved.
3833static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3834 __m128i __b) {
3835 struct __storeu_si128 {
3836 __m128i_u __v;
3837 } __attribute__((__packed__, __may_alias__));
3838 ((struct __storeu_si128 *)__p)->__v = __b;
3839}
3840
3841/// Stores a 64-bit integer value from the low element of a 128-bit integer
3842/// vector.
3843///
3844/// \headerfile <x86intrin.h>
3845///
3846/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3847///
3848/// \param __p
3849/// A pointer to a 64-bit memory location. The address of the memory
3850/// location does not have to be aligned.
3851/// \param __b
3852/// A 128-bit integer vector containing the value to be stored.
3853static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3854 __m128i __b) {
3855 struct __storeu_si64 {
3856 long long __v;
3857 } __attribute__((__packed__, __may_alias__));
3858 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3859}
3860
3861/// Stores a 32-bit integer value from the low element of a 128-bit integer
3862/// vector.
3863///
3864/// \headerfile <x86intrin.h>
3865///
3866/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3867///
3868/// \param __p
3869/// A pointer to a 32-bit memory location. The address of the memory
3870/// location does not have to be aligned.
3871/// \param __b
3872/// A 128-bit integer vector containing the value to be stored.
3873static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3874 __m128i __b) {
3875 struct __storeu_si32 {
3876 int __v;
3877 } __attribute__((__packed__, __may_alias__));
3878 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3879}
3880
3881/// Stores a 16-bit integer value from the low element of a 128-bit integer
3882/// vector.
3883///
3884/// \headerfile <x86intrin.h>
3885///
3886/// This intrinsic does not correspond to a specific instruction.
3887///
3888/// \param __p
3889/// A pointer to a 16-bit memory location. The address of the memory
3890/// location does not have to be aligned.
3891/// \param __b
3892/// A 128-bit integer vector containing the value to be stored.
3893static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3894 __m128i __b) {
3895 struct __storeu_si16 {
3896 short __v;
3897 } __attribute__((__packed__, __may_alias__));
3898 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3899}
3900
3901/// Moves bytes selected by the mask from the first operand to the
3902/// specified unaligned memory location. When a mask bit is 1, the
3903/// corresponding byte is written, otherwise it is not written.
3904///
3905/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3906/// used again soon). Exception and trap behavior for elements not selected
3907/// for storage to memory are implementation dependent.
3908///
3909/// \headerfile <x86intrin.h>
3910///
3911/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3912/// instruction.
3913///
3914/// \param __d
3915/// A 128-bit integer vector containing the values to be moved.
3916/// \param __n
3917/// A 128-bit integer vector containing the mask. The most significant bit of
3918/// each byte represents the mask bits.
3919/// \param __p
3920/// A pointer to an unaligned 128-bit memory location where the specified
3921/// values are moved.
3922static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3923 __m128i __n,
3924 char *__p) {
3925 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3926}
3927
3928/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3929/// a memory location.
3930///
3931/// \headerfile <x86intrin.h>
3932///
3933/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3934///
3935/// \param __p
3936/// A pointer to a 64-bit memory location that will receive the lower 64 bits
3937/// of the integer vector parameter.
3938/// \param __a
3939/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3940/// value to be stored.
3941static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3942 __m128i __a) {
3943 struct __mm_storel_epi64_struct {
3944 long long __u;
3945 } __attribute__((__packed__, __may_alias__));
3946 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3947}
3948
3949/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3950/// aligned memory location.
3951///
3952/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3953/// used again soon).
3954///
3955/// \headerfile <x86intrin.h>
3956///
3957/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3958///
3959/// \param __p
3960/// A pointer to the 128-bit aligned memory location used to store the value.
3961/// \param __a
3962/// A vector of [2 x double] containing the 64-bit values to be stored.
3963static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
3964 __m128d __a) {
3965 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3966}
3967
3968/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3969///
3970/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3971/// used again soon).
3972///
3973/// \headerfile <x86intrin.h>
3974///
3975/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3976///
3977/// \param __p
3978/// A pointer to the 128-bit aligned memory location used to store the value.
3979/// \param __a
3980/// A 128-bit integer vector containing the values to be stored.
3981static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
3982 __m128i __a) {
3983 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3984}
3985
/// Writes a 32-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location that receives the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
4004
#ifdef __x86_64__
/// Writes a 64-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location that receives the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif
4025
/* The following functions are only declared here; give them C linkage when
 * this header is consumed from C++. */
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all caches
///    in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and store
///    instructions that precede this instruction and the load and store
///    instructions that follow it, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4067
4068/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4069/// vector operands into 8-bit signed integers, and packs the results into
4070/// the destination.
4071///
4072/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
4073/// less than 0x80 are saturated to 0x80.
4074///
4075/// \headerfile <x86intrin.h>
4076///
4077/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4078///
4079/// \param __a
4080/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4081/// written to the lower 64 bits of the result.
4082/// \param __b
4083/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4084/// written to the higher 64 bits of the result.
4085/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4086static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4087 __m128i __b) {
4088 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4089}
4090
4091/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4092/// vector operands into 16-bit signed integers, and packs the results into
4093/// the destination.
4094///
4095/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4096/// values less than 0x8000 are saturated to 0x8000.
4097///
4098/// \headerfile <x86intrin.h>
4099///
4100/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4101///
4102/// \param __a
4103/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4104/// are written to the lower 64 bits of the result.
4105/// \param __b
4106/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4107/// are written to the higher 64 bits of the result.
4108/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4109static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4110 __m128i __b) {
4111 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4112}
4113
4114/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4115/// vector operands into 8-bit unsigned integers, and packs the results into
4116/// the destination.
4117///
4118/// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4119/// are saturated to 0x00.
4120///
4121/// \headerfile <x86intrin.h>
4122///
4123/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4124///
4125/// \param __a
4126/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4127/// written to the lower 64 bits of the result.
4128/// \param __b
4129/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4130/// written to the higher 64 bits of the result.
4131/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4132static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4133 __m128i __b) {
4134 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4135}
4136
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the 16-bit element of \a a that
///    is assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
/* The cast through unsigned short zero-extends the extracted element before
 * it is widened to int. */
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4166
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then replacing one of its eight
///    16-bit elements, selected by the immediate-value parameter, with the
///    lower 16 bits of an integer parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value. Bits [2:0] give the index of the 16-bit element of
///    the result that receives the lower 16 bits of \a b; element <c>n</c>
///    occupies bits [16*n+15:16*n] of the result.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4194
4195/// Copies the values of the most significant bits from each 8-bit
4196/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4197/// value, zero-extends the value, and writes it to the destination.
4198///
4199/// \headerfile <x86intrin.h>
4200///
4201/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4202///
4203/// \param __a
4204/// A 128-bit integer vector containing the values with bits to be extracted.
4205/// \returns The most significant bits from each 8-bit element in \a __a,
4206/// written to bits [15:0]. The other bits are assigned zeros.
4207static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4208 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4209}
4210
/// Builds a 128-bit integer vector by shuffling the four 32-bit elements of
///    a 128-bit integer vector parameter, as directed by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements
///    to copy from \a a. Each 2-bit field selects the source element for one
///    32-bit element of the 128-bit destination: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4244
/// Builds a 128-bit integer vector by shuffling the four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied
///    unchanged to bits [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4277
/// Builds a 128-bit integer vector by shuffling the four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied unchanged
///    to bits [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4310
4311/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4312/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4313///
4314/// \headerfile <x86intrin.h>
4315///
4316/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4317/// instruction.
4318///
4319/// \param __a
4320/// A 128-bit vector of [16 x i8].
4321/// Bits [71:64] are written to bits [7:0] of the result. \n
4322/// Bits [79:72] are written to bits [23:16] of the result. \n
4323/// Bits [87:80] are written to bits [39:32] of the result. \n
4324/// Bits [95:88] are written to bits [55:48] of the result. \n
4325/// Bits [103:96] are written to bits [71:64] of the result. \n
4326/// Bits [111:104] are written to bits [87:80] of the result. \n
4327/// Bits [119:112] are written to bits [103:96] of the result. \n
4328/// Bits [127:120] are written to bits [119:112] of the result.
4329/// \param __b
4330/// A 128-bit vector of [16 x i8]. \n
4331/// Bits [71:64] are written to bits [15:8] of the result. \n
4332/// Bits [79:72] are written to bits [31:24] of the result. \n
4333/// Bits [87:80] are written to bits [47:40] of the result. \n
4334/// Bits [95:88] are written to bits [63:56] of the result. \n
4335/// Bits [103:96] are written to bits [79:72] of the result. \n
4336/// Bits [111:104] are written to bits [95:88] of the result. \n
4337/// Bits [119:112] are written to bits [111:104] of the result. \n
4338/// Bits [127:120] are written to bits [127:120] of the result.
4339/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4340static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4341 __m128i __b) {
4342 return (__m128i)__builtin_shufflevector(
4343 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4344 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4345}
4346
4347/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4348/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4349///
4350/// \headerfile <x86intrin.h>
4351///
4352/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4353/// instruction.
4354///
4355/// \param __a
4356/// A 128-bit vector of [8 x i16].
4357/// Bits [79:64] are written to bits [15:0] of the result. \n
4358/// Bits [95:80] are written to bits [47:32] of the result. \n
4359/// Bits [111:96] are written to bits [79:64] of the result. \n
4360/// Bits [127:112] are written to bits [111:96] of the result.
4361/// \param __b
4362/// A 128-bit vector of [8 x i16].
4363/// Bits [79:64] are written to bits [31:16] of the result. \n
4364/// Bits [95:80] are written to bits [63:48] of the result. \n
4365/// Bits [111:96] are written to bits [95:80] of the result. \n
4366/// Bits [127:112] are written to bits [127:112] of the result.
4367/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4368static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4369 __m128i __b) {
4370 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4371 8 + 5, 6, 8 + 6, 7, 8 + 7);
4372}
4373
4374/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4375/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4376///
4377/// \headerfile <x86intrin.h>
4378///
4379/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4380/// instruction.
4381///
4382/// \param __a
4383/// A 128-bit vector of [4 x i32]. \n
4384/// Bits [95:64] are written to bits [31:0] of the destination. \n
4385/// Bits [127:96] are written to bits [95:64] of the destination.
4386/// \param __b
4387/// A 128-bit vector of [4 x i32]. \n
4388/// Bits [95:64] are written to bits [64:32] of the destination. \n
4389/// Bits [127:96] are written to bits [127:96] of the destination.
4390/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4391static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4392 __m128i __b) {
4393 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4394 4 + 3);
4395}
4396
4397/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4398/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4399///
4400/// \headerfile <x86intrin.h>
4401///
4402/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4403/// instruction.
4404///
4405/// \param __a
4406/// A 128-bit vector of [2 x i64]. \n
4407/// Bits [127:64] are written to bits [63:0] of the destination.
4408/// \param __b
4409/// A 128-bit vector of [2 x i64]. \n
4410/// Bits [127:64] are written to bits [127:64] of the destination.
4411/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4412static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4413 __m128i __b) {
4414 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4415}
4416
4417/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4418/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4419///
4420/// \headerfile <x86intrin.h>
4421///
4422/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4423/// instruction.
4424///
4425/// \param __a
4426/// A 128-bit vector of [16 x i8]. \n
4427/// Bits [7:0] are written to bits [7:0] of the result. \n
4428/// Bits [15:8] are written to bits [23:16] of the result. \n
4429/// Bits [23:16] are written to bits [39:32] of the result. \n
4430/// Bits [31:24] are written to bits [55:48] of the result. \n
4431/// Bits [39:32] are written to bits [71:64] of the result. \n
4432/// Bits [47:40] are written to bits [87:80] of the result. \n
4433/// Bits [55:48] are written to bits [103:96] of the result. \n
4434/// Bits [63:56] are written to bits [119:112] of the result.
4435/// \param __b
4436/// A 128-bit vector of [16 x i8].
4437/// Bits [7:0] are written to bits [15:8] of the result. \n
4438/// Bits [15:8] are written to bits [31:24] of the result. \n
4439/// Bits [23:16] are written to bits [47:40] of the result. \n
4440/// Bits [31:24] are written to bits [63:56] of the result. \n
4441/// Bits [39:32] are written to bits [79:72] of the result. \n
4442/// Bits [47:40] are written to bits [95:88] of the result. \n
4443/// Bits [55:48] are written to bits [111:104] of the result. \n
4444/// Bits [63:56] are written to bits [127:120] of the result.
4445/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4446static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4447 __m128i __b) {
4448 return (__m128i)__builtin_shufflevector(
4449 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4450 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4451}
4452
4453/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4454/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4455/// [8 x i16].
4456///
4457/// \headerfile <x86intrin.h>
4458///
4459/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4460/// instruction.
4461///
4462/// \param __a
4463/// A 128-bit vector of [8 x i16].
4464/// Bits [15:0] are written to bits [15:0] of the result. \n
4465/// Bits [31:16] are written to bits [47:32] of the result. \n
4466/// Bits [47:32] are written to bits [79:64] of the result. \n
4467/// Bits [63:48] are written to bits [111:96] of the result.
4468/// \param __b
4469/// A 128-bit vector of [8 x i16].
4470/// Bits [15:0] are written to bits [31:16] of the result. \n
4471/// Bits [31:16] are written to bits [63:48] of the result. \n
4472/// Bits [47:32] are written to bits [95:80] of the result. \n
4473/// Bits [63:48] are written to bits [127:112] of the result.
4474/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4475static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4476 __m128i __b) {
4477 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4478 8 + 1, 2, 8 + 2, 3, 8 + 3);
4479}
4480
4481/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4482/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4483///
4484/// \headerfile <x86intrin.h>
4485///
4486/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4487/// instruction.
4488///
4489/// \param __a
4490/// A 128-bit vector of [4 x i32]. \n
4491/// Bits [31:0] are written to bits [31:0] of the destination. \n
4492/// Bits [63:32] are written to bits [95:64] of the destination.
4493/// \param __b
4494/// A 128-bit vector of [4 x i32]. \n
4495/// Bits [31:0] are written to bits [64:32] of the destination. \n
4496/// Bits [63:32] are written to bits [127:96] of the destination.
4497/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4498static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4499 __m128i __b) {
4500 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4501 4 + 1);
4502}
4503
4504/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4505/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4506///
4507/// \headerfile <x86intrin.h>
4508///
4509/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4510/// instruction.
4511///
4512/// \param __a
4513/// A 128-bit vector of [2 x i64]. \n
4514/// Bits [63:0] are written to bits [63:0] of the destination. \n
4515/// \param __b
4516/// A 128-bit vector of [2 x i64]. \n
4517/// Bits [63:0] are written to bits [127:64] of the destination. \n
4518/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4519static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4520 __m128i __b) {
4521 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4522}
4523
4524/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4525/// integer.
4526///
4527/// \headerfile <x86intrin.h>
4528///
4529/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4530///
4531/// \param __a
4532/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4533/// destination.
4534/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4535static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4536 return (__m64)__a[0];
4537}
4538
4539/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4540/// upper bits.
4541///
4542/// \headerfile <x86intrin.h>
4543///
4544/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4545///
4546/// \param __a
4547/// A 64-bit value.
4548/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4549/// the operand. The upper 64 bits are assigned zeros.
4550static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4551 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4552}
4553
4554/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4555/// integer vector, zeroing the upper bits.
4556///
4557/// \headerfile <x86intrin.h>
4558///
4559/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4560///
4561/// \param __a
4562/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4563/// destination.
4564/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4565/// the operand. The upper 64 bits are assigned zeros.
4566static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4567 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4568}
4569
4570/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4571/// [2 x double] and interleaves them into a 128-bit vector of [2 x
4572/// double].
4573///
4574/// \headerfile <x86intrin.h>
4575///
4576/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4577///
4578/// \param __a
4579/// A 128-bit vector of [2 x double]. \n
4580/// Bits [127:64] are written to bits [63:0] of the destination.
4581/// \param __b
4582/// A 128-bit vector of [2 x double]. \n
4583/// Bits [127:64] are written to bits [127:64] of the destination.
4584/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4585static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4586 __m128d __b) {
4587 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4588}
4589
4590/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4591/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4592/// double].
4593///
4594/// \headerfile <x86intrin.h>
4595///
4596/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4597///
4598/// \param __a
4599/// A 128-bit vector of [2 x double]. \n
4600/// Bits [63:0] are written to bits [63:0] of the destination.
4601/// \param __b
4602/// A 128-bit vector of [2 x double]. \n
4603/// Bits [63:0] are written to bits [127:64] of the destination.
4604/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4605static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4606 __m128d __b) {
4607 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4608}
4609
4610/// Extracts the sign bits of the double-precision values in the 128-bit
4611/// vector of [2 x double], zero-extends the value, and writes it to the
4612/// low-order bits of the destination.
4613///
4614/// \headerfile <x86intrin.h>
4615///
4616/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4617///
4618/// \param __a
4619/// A 128-bit vector of [2 x double] containing the values with sign bits to
4620/// be extracted.
4621/// \returns The sign bits from each of the double-precision elements in \a __a,
4622/// written to bits [1:0]. The remaining bits are assigned values of zero.
4623static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4624 return __builtin_ia32_movmskpd((__v2df)__a);
4625}
4626
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4657
4658/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4659/// floating-point vector of [4 x float].
4660///
4661/// \headerfile <x86intrin.h>
4662///
4663/// This intrinsic has no corresponding instruction.
4664///
4665/// \param __a
4666/// A 128-bit floating-point vector of [2 x double].
4667/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4668/// bitwise pattern as the parameter.
4669static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4670 return (__m128)__a;
4671}
4672
4673/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4674/// integer vector.
4675///
4676/// \headerfile <x86intrin.h>
4677///
4678/// This intrinsic has no corresponding instruction.
4679///
4680/// \param __a
4681/// A 128-bit floating-point vector of [2 x double].
4682/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4683/// parameter.
4684static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4685 return (__m128i)__a;
4686}
4687
4688/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4689/// floating-point vector of [2 x double].
4690///
4691/// \headerfile <x86intrin.h>
4692///
4693/// This intrinsic has no corresponding instruction.
4694///
4695/// \param __a
4696/// A 128-bit floating-point vector of [4 x float].
4697/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4698/// bitwise pattern as the parameter.
4699static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4700 return (__m128d)__a;
4701}
4702
4703/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4704/// integer vector.
4705///
4706/// \headerfile <x86intrin.h>
4707///
4708/// This intrinsic has no corresponding instruction.
4709///
4710/// \param __a
4711/// A 128-bit floating-point vector of [4 x float].
4712/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4713/// parameter.
4714static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4715 return (__m128i)__a;
4716}
4717
4718/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4719/// of [4 x float].
4720///
4721/// \headerfile <x86intrin.h>
4722///
4723/// This intrinsic has no corresponding instruction.
4724///
4725/// \param __a
4726/// A 128-bit integer vector.
4727/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4728/// bitwise pattern as the parameter.
4729static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4730 return (__m128)__a;
4731}
4732
4733/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4734/// of [2 x double].
4735///
4736/// \headerfile <x86intrin.h>
4737///
4738/// This intrinsic has no corresponding instruction.
4739///
4740/// \param __a
4741/// A 128-bit integer vector.
4742/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4743/// bitwise pattern as the parameter.
4744static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4745 return (__m128d)__a;
4746}
4747
/* Give the declaration below C linkage when this header is compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* The attribute helper macros are internal to this header; drop them so they
 * are not visible to code that includes it. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Builds the 2-bit selector [x, y] used by _mm_shuffle_pd: x becomes bit 1 and
 * y becomes bit 0. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values for the denormals-are-zero field of the control/status word accessed
 * through _mm_getcsr() / _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Mask selecting the denormals-are-zero field (bit 6, value 0x0040). */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Reads the current mode: yields _MM_DENORMALS_ZERO_ON or
 * _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Replaces the masked field with x, leaving every other bit of the value
 * returned by _mm_getcsr() unchanged; x should be one of the two mode values
 * above. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4777
4778#endif /* __EMMINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1454
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3686
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1017
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4475
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4550
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1917
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3529
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:992
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1769
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2323
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:569
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:74
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:212
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4684
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:390
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1610
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:3981
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4207
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2775
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2626
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:795
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1165
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1573
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2523
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3354
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3922
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1140
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3489
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1190
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2118
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1517
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1297
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3038
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2966
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1753
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3175
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1787
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2471
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:719
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:193
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3941
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3195
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2661
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:504
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3893
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:294
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1648
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1629
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4585
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:743
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2645
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3093
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2984
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3056
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2380
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1115
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2830
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:407
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2285
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2222
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1897
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4744
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2948
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4446
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:768
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3115
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:943
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:52
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4605
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2545
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:695
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:649
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4519
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2868
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4623
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3074
static __inline__ void int __a
Definition: emmintrin.h:4001
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:153
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2609
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3873
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4566
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:464
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4412
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
Definition: emmintrin.h:1439
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3265
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3323
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4498
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3135
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1400
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:253
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1840
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4368
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1472
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3421
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4714
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1215
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2247
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2184
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3308
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1591
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3002
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2721
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:589
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1353
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1330
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:114
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1699
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2162
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2140
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2567
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2454
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1719
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4391
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1258
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3787
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2342
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1067
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:549
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2096
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2361
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:373
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:967
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2266
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4669
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2811
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4086
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3633
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2849
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1557
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1042
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:629
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4699
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1973
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2757
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2437
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:918
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:822
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1992
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1878
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2058
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3384
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3667
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: emmintrin.h:4109
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2739
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2016
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:894
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:484
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3369
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:671
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:3963
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3155
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4729
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1821
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:353
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3441
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1375
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2416
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:870
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3020
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:846
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3294
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3740
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1673
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3578
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3616
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:236
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1807
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3462
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2506
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:275
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2793
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:92
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3853
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1857
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1533
#define __DEFAULT_FN_ATTRS_MMX
Definition: emmintrin.h:55
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:336
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2304
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4535
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3818
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:171
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:425
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1737
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1487
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:317
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1502
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3404
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3650
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3599
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3338
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2488
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2203
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2398
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:1956
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4340
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2930
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1934
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1240
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2037
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the ...
Definition: emmintrin.h:2588
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2075
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1276
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3802
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1092
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:526
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:444
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3833
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2887
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3708
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1423
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3279
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2678
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:609
struct __storeu_i16 *__P __v
Definition: immintrin.h:525