clang 20.0.0git
emmintrin.h
Go to the documentation of this file.
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __EMMINTRIN_H
11#define __EMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <xmmintrin.h>
18
/* Public SSE2 vector types: a 16-byte vector of double (__m128d) and a
 * 16-byte vector of long long (__m128i), both declared 16-byte aligned. */
19typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21
/* Same element types and vector size, but declared with 1-byte alignment. */
22typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23typedef long long __m128i_u
24 __attribute__((__vector_size__(16), __aligned__(1)));
25
/* Internal 16-byte vector types, one per element type. The functions in this
 * header cast to these to pick the element type an operation works on. */
26/* Type defines. */
27typedef double __v2df __attribute__((__vector_size__(16)));
28typedef long long __v2di __attribute__((__vector_size__(16)));
29typedef short __v8hi __attribute__((__vector_size__(16)));
30typedef char __v16qi __attribute__((__vector_size__(16)));
31
/* Unsigned counterparts of the integer vector types above. */
32/* Unsigned types */
33typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36
37/* We need an explicitly signed variant for char. Note that this shouldn't
38 * appear in the interface though. */
39typedef signed char __v16qs __attribute__((__vector_size__(16)));
40
/* Half-precision and bfloat16 vector types; only declared when __SSE2__ is
 * defined. */
41#ifdef __SSE2__
42/* Both _Float16 and __bf16 require SSE2 being enabled. */
/* 16-byte vectors of _Float16: two 16-byte-aligned names and a 1-byte-aligned
 * (_u) variant. */
43typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46
/* 16-byte, 16-byte-aligned vectors of __bf16. */
47typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49#endif
50
51/* Define the default attributes for the functions in this file. */
/* Every function using this macro is always inlined, carries no debug info,
 * is compiled with target features "sse2,no-evex512", and declares a minimum
 * vector width of 128 bits. */
52#define __DEFAULT_FN_ATTRS \
53 __attribute__((__always_inline__, __nodebug__, \
54 __target__("sse2,no-evex512"), __min_vector_width__(128)))
55
/* Helper: views x as a vector of two 64-bit integers and keeps only element 0,
 * cast to __m64. NOTE(review): __m64 is not declared in this file; presumably
 * it is provided through <xmmintrin.h> -- confirm. */
56#define __trunc64(x) \
57 (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
/* Helper: views x as __v2si and widens it to four elements, cast to __m128i.
 * Elements 0 and 1 come from x; the other two use shuffle index -1, which
 * Clang documents as "don't care", so their contents are unspecified.
 * NOTE(review): __v2si is not declared in this file -- confirm its origin. */
58#define __anyext128(x) \
59 (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
60 1, -1, -1)
61
62/// Adds lower double-precision values in both operands and returns the
63/// sum in the lower 64 bits of the result. The upper 64 bits of the result
64/// are copied from the upper double-precision value of the first operand.
65///
66/// \headerfile <x86intrin.h>
67///
68/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
69///
70/// \param __a
71/// A 128-bit vector of [2 x double] containing one of the source operands.
72/// \param __b
73/// A 128-bit vector of [2 x double] containing one of the source operands.
74/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
75/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
76/// from the upper 64 bits of the first source operand.
77static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
78 __m128d __b) {
79 __a[0] += __b[0];
80 return __a;
81}
82
83/// Adds two 128-bit vectors of [2 x double].
84///
85/// \headerfile <x86intrin.h>
86///
87/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
88///
89/// \param __a
90/// A 128-bit vector of [2 x double] containing one of the source operands.
91/// \param __b
92/// A 128-bit vector of [2 x double] containing one of the source operands.
93/// \returns A 128-bit vector of [2 x double] containing the sums of both
94/// operands.
95static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
96 __m128d __b) {
97 return (__m128d)((__v2df)__a + (__v2df)__b);
98}
99
100/// Subtracts the lower double-precision value of the second operand
101/// from the lower double-precision value of the first operand and returns
102/// the difference in the lower 64 bits of the result. The upper 64 bits of
103/// the result are copied from the upper double-precision value of the first
104/// operand.
105///
106/// \headerfile <x86intrin.h>
107///
108/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
109///
110/// \param __a
111/// A 128-bit vector of [2 x double] containing the minuend.
112/// \param __b
113/// A 128-bit vector of [2 x double] containing the subtrahend.
114/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
115/// difference of the lower 64 bits of both operands. The upper 64 bits are
116/// copied from the upper 64 bits of the first source operand.
117static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
118 __m128d __b) {
119 __a[0] -= __b[0];
120 return __a;
121}
122
123/// Subtracts two 128-bit vectors of [2 x double].
124///
125/// \headerfile <x86intrin.h>
126///
127/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
128///
129/// \param __a
130/// A 128-bit vector of [2 x double] containing the minuend.
131/// \param __b
132/// A 128-bit vector of [2 x double] containing the subtrahend.
133/// \returns A 128-bit vector of [2 x double] containing the differences between
134/// both operands.
135static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
136 __m128d __b) {
137 return (__m128d)((__v2df)__a - (__v2df)__b);
138}
139
140/// Multiplies lower double-precision values in both operands and returns
141/// the product in the lower 64 bits of the result. The upper 64 bits of the
142/// result are copied from the upper double-precision value of the first
143/// operand.
144///
145/// \headerfile <x86intrin.h>
146///
147/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
148///
149/// \param __a
150/// A 128-bit vector of [2 x double] containing one of the source operands.
151/// \param __b
152/// A 128-bit vector of [2 x double] containing one of the source operands.
153/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
154/// product of the lower 64 bits of both operands. The upper 64 bits are
155/// copied from the upper 64 bits of the first source operand.
156static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
157 __m128d __b) {
158 __a[0] *= __b[0];
159 return __a;
160}
161
162/// Multiplies two 128-bit vectors of [2 x double].
163///
164/// \headerfile <x86intrin.h>
165///
166/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
167///
168/// \param __a
169/// A 128-bit vector of [2 x double] containing one of the operands.
170/// \param __b
171/// A 128-bit vector of [2 x double] containing one of the operands.
172/// \returns A 128-bit vector of [2 x double] containing the products of both
173/// operands.
174static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
175 __m128d __b) {
176 return (__m128d)((__v2df)__a * (__v2df)__b);
177}
178
179/// Divides the lower double-precision value of the first operand by the
180/// lower double-precision value of the second operand and returns the
181/// quotient in the lower 64 bits of the result. The upper 64 bits of the
182/// result are copied from the upper double-precision value of the first
183/// operand.
184///
185/// \headerfile <x86intrin.h>
186///
187/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
188///
189/// \param __a
190/// A 128-bit vector of [2 x double] containing the dividend.
191/// \param __b
192/// A 128-bit vector of [2 x double] containing divisor.
193/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
194/// quotient of the lower 64 bits of both operands. The upper 64 bits are
195/// copied from the upper 64 bits of the first source operand.
196static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
197 __m128d __b) {
198 __a[0] /= __b[0];
199 return __a;
200}
201
202/// Performs an element-by-element division of two 128-bit vectors of
203/// [2 x double].
204///
205/// \headerfile <x86intrin.h>
206///
207/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
208///
209/// \param __a
210/// A 128-bit vector of [2 x double] containing the dividend.
211/// \param __b
212/// A 128-bit vector of [2 x double] containing the divisor.
213/// \returns A 128-bit vector of [2 x double] containing the quotients of both
214/// operands.
215static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
216 __m128d __b) {
217 return (__m128d)((__v2df)__a / (__v2df)__b);
218}
219
220/// Calculates the square root of the lower double-precision value of
221/// the second operand and returns it in the lower 64 bits of the result.
222/// The upper 64 bits of the result are copied from the upper
223/// double-precision value of the first operand.
224///
225/// \headerfile <x86intrin.h>
226///
227/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
228///
229/// \param __a
230/// A 128-bit vector of [2 x double] containing one of the operands. The
231/// upper 64 bits of this operand are copied to the upper 64 bits of the
232/// result.
233/// \param __b
234/// A 128-bit vector of [2 x double] containing one of the operands. The
235/// square root is calculated using the lower 64 bits of this operand.
236/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
237/// square root of the lower 64 bits of operand \a __b, and whose upper 64
238/// bits are copied from the upper 64 bits of operand \a __a.
239static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
240 __m128d __b) {
241 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
242 return __extension__(__m128d){__c[0], __a[1]};
243}
244
245/// Calculates the square root of the each of two values stored in a
246/// 128-bit vector of [2 x double].
247///
248/// \headerfile <x86intrin.h>
249///
250/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
251///
252/// \param __a
253/// A 128-bit vector of [2 x double].
254/// \returns A 128-bit vector of [2 x double] containing the square roots of the
255/// values in the operand.
256static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
257 return __builtin_ia32_sqrtpd((__v2df)__a);
258}
259
260/// Compares lower 64-bit double-precision values of both operands, and
261/// returns the lesser of the pair of values in the lower 64-bits of the
262/// result. The upper 64 bits of the result are copied from the upper
263/// double-precision value of the first operand.
264///
265/// If either value in a comparison is NaN, returns the value from \a __b.
266///
267/// \headerfile <x86intrin.h>
268///
269/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
270///
271/// \param __a
272/// A 128-bit vector of [2 x double] containing one of the operands. The
273/// lower 64 bits of this operand are used in the comparison.
274/// \param __b
275/// A 128-bit vector of [2 x double] containing one of the operands. The
276/// lower 64 bits of this operand are used in the comparison.
277/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
278/// minimum value between both operands. The upper 64 bits are copied from
279/// the upper 64 bits of the first source operand.
280static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
281 __m128d __b) {
282 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
283}
284
285/// Performs element-by-element comparison of the two 128-bit vectors of
286/// [2 x double] and returns a vector containing the lesser of each pair of
287/// values.
288///
289/// If either value in a comparison is NaN, returns the value from \a __b.
290///
291/// \headerfile <x86intrin.h>
292///
293/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
294///
295/// \param __a
296/// A 128-bit vector of [2 x double] containing one of the operands.
297/// \param __b
298/// A 128-bit vector of [2 x double] containing one of the operands.
299/// \returns A 128-bit vector of [2 x double] containing the minimum values
300/// between both operands.
301static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
302 __m128d __b) {
303 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
304}
305
306/// Compares lower 64-bit double-precision values of both operands, and
307/// returns the greater of the pair of values in the lower 64-bits of the
308/// result. The upper 64 bits of the result are copied from the upper
309/// double-precision value of the first operand.
310///
311/// If either value in a comparison is NaN, returns the value from \a __b.
312///
313/// \headerfile <x86intrin.h>
314///
315/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
316///
317/// \param __a
318/// A 128-bit vector of [2 x double] containing one of the operands. The
319/// lower 64 bits of this operand are used in the comparison.
320/// \param __b
321/// A 128-bit vector of [2 x double] containing one of the operands. The
322/// lower 64 bits of this operand are used in the comparison.
323/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
324/// maximum value between both operands. The upper 64 bits are copied from
325/// the upper 64 bits of the first source operand.
326static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
327 __m128d __b) {
328 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
329}
330
331/// Performs element-by-element comparison of the two 128-bit vectors of
332/// [2 x double] and returns a vector containing the greater of each pair
333/// of values.
334///
335/// If either value in a comparison is NaN, returns the value from \a __b.
336///
337/// \headerfile <x86intrin.h>
338///
339/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
340///
341/// \param __a
342/// A 128-bit vector of [2 x double] containing one of the operands.
343/// \param __b
344/// A 128-bit vector of [2 x double] containing one of the operands.
345/// \returns A 128-bit vector of [2 x double] containing the maximum values
346/// between both operands.
347static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
348 __m128d __b) {
349 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
350}
351
352/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
353///
354/// \headerfile <x86intrin.h>
355///
356/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
357///
358/// \param __a
359/// A 128-bit vector of [2 x double] containing one of the source operands.
360/// \param __b
361/// A 128-bit vector of [2 x double] containing one of the source operands.
362/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
363/// values between both operands.
364static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
365 __m128d __b) {
366 return (__m128d)((__v2du)__a & (__v2du)__b);
367}
368
369/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
370/// the one's complement of the values contained in the first source operand.
371///
372/// \headerfile <x86intrin.h>
373///
374/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
375///
376/// \param __a
377/// A 128-bit vector of [2 x double] containing the left source operand. The
378/// one's complement of this value is used in the bitwise AND.
379/// \param __b
380/// A 128-bit vector of [2 x double] containing the right source operand.
381/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
382/// values in the second operand and the one's complement of the first
383/// operand.
384static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
385 __m128d __b) {
386 return (__m128d)(~(__v2du)__a & (__v2du)__b);
387}
388
389/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
390///
391/// \headerfile <x86intrin.h>
392///
393/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
394///
395/// \param __a
396/// A 128-bit vector of [2 x double] containing one of the source operands.
397/// \param __b
398/// A 128-bit vector of [2 x double] containing one of the source operands.
399/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
400/// values between both operands.
401static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
402 __m128d __b) {
403 return (__m128d)((__v2du)__a | (__v2du)__b);
404}
405
406/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
407///
408/// \headerfile <x86intrin.h>
409///
410/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
411///
412/// \param __a
413/// A 128-bit vector of [2 x double] containing one of the source operands.
414/// \param __b
415/// A 128-bit vector of [2 x double] containing one of the source operands.
416/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
417/// values between both operands.
418static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
419 __m128d __b) {
420 return (__m128d)((__v2du)__a ^ (__v2du)__b);
421}
422
423/// Compares each of the corresponding double-precision values of the
424/// 128-bit vectors of [2 x double] for equality.
425///
426/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
427/// If either value in a comparison is NaN, returns false.
428///
429/// \headerfile <x86intrin.h>
430///
431/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
432///
433/// \param __a
434/// A 128-bit vector of [2 x double].
435/// \param __b
436/// A 128-bit vector of [2 x double].
437/// \returns A 128-bit vector containing the comparison results.
438static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
439 __m128d __b) {
440 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
441}
442
443/// Compares each of the corresponding double-precision values of the
444/// 128-bit vectors of [2 x double] to determine if the values in the first
445/// operand are less than those in the second operand.
446///
447/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
448/// If either value in a comparison is NaN, returns false.
449///
450/// \headerfile <x86intrin.h>
451///
452/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
453///
454/// \param __a
455/// A 128-bit vector of [2 x double].
456/// \param __b
457/// A 128-bit vector of [2 x double].
458/// \returns A 128-bit vector containing the comparison results.
459static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
460 __m128d __b) {
461 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
462}
463
464/// Compares each of the corresponding double-precision values of the
465/// 128-bit vectors of [2 x double] to determine if the values in the first
466/// operand are less than or equal to those in the second operand.
467///
468/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
469/// If either value in a comparison is NaN, returns false.
470///
471/// \headerfile <x86intrin.h>
472///
473/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
474///
475/// \param __a
476/// A 128-bit vector of [2 x double].
477/// \param __b
478/// A 128-bit vector of [2 x double].
479/// \returns A 128-bit vector containing the comparison results.
480static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
481 __m128d __b) {
482 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
483}
484
485/// Compares each of the corresponding double-precision values of the
486/// 128-bit vectors of [2 x double] to determine if the values in the first
487/// operand are greater than those in the second operand.
488///
489/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
490/// If either value in a comparison is NaN, returns false.
491///
492/// \headerfile <x86intrin.h>
493///
494/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
495///
496/// \param __a
497/// A 128-bit vector of [2 x double].
498/// \param __b
499/// A 128-bit vector of [2 x double].
500/// \returns A 128-bit vector containing the comparison results.
501static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
502 __m128d __b) {
503 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
504}
505
506/// Compares each of the corresponding double-precision values of the
507/// 128-bit vectors of [2 x double] to determine if the values in the first
508/// operand are greater than or equal to those in the second operand.
509///
510/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
511/// If either value in a comparison is NaN, returns false.
512///
513/// \headerfile <x86intrin.h>
514///
515/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
516///
517/// \param __a
518/// A 128-bit vector of [2 x double].
519/// \param __b
520/// A 128-bit vector of [2 x double].
521/// \returns A 128-bit vector containing the comparison results.
522static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
523 __m128d __b) {
524 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
525}
526
527/// Compares each of the corresponding double-precision values of the
528/// 128-bit vectors of [2 x double] to determine if the values in the first
529/// operand are ordered with respect to those in the second operand.
530///
531/// A pair of double-precision values are ordered with respect to each
532/// other if neither value is a NaN. Each comparison returns 0x0 for false,
533/// 0xFFFFFFFFFFFFFFFF for true.
534///
535/// \headerfile <x86intrin.h>
536///
537/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
538///
539/// \param __a
540/// A 128-bit vector of [2 x double].
541/// \param __b
542/// A 128-bit vector of [2 x double].
543/// \returns A 128-bit vector containing the comparison results.
544static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
545 __m128d __b) {
546 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
547}
548
549/// Compares each of the corresponding double-precision values of the
550/// 128-bit vectors of [2 x double] to determine if the values in the first
551/// operand are unordered with respect to those in the second operand.
552///
553/// A pair of double-precision values are unordered with respect to each
554/// other if one or both values are NaN. Each comparison returns 0x0 for
555/// false, 0xFFFFFFFFFFFFFFFF for true.
556///
557/// \headerfile <x86intrin.h>
558///
559/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
560/// instruction.
561///
562/// \param __a
563/// A 128-bit vector of [2 x double].
564/// \param __b
565/// A 128-bit vector of [2 x double].
566/// \returns A 128-bit vector containing the comparison results.
567static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
568 __m128d __b) {
569 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
570}
571
572/// Compares each of the corresponding double-precision values of the
573/// 128-bit vectors of [2 x double] to determine if the values in the first
574/// operand are unequal to those in the second operand.
575///
576/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
577/// If either value in a comparison is NaN, returns true.
578///
579/// \headerfile <x86intrin.h>
580///
581/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
582///
583/// \param __a
584/// A 128-bit vector of [2 x double].
585/// \param __b
586/// A 128-bit vector of [2 x double].
587/// \returns A 128-bit vector containing the comparison results.
588static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
589 __m128d __b) {
590 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
591}
592
593/// Compares each of the corresponding double-precision values of the
594/// 128-bit vectors of [2 x double] to determine if the values in the first
595/// operand are not less than those in the second operand.
596///
597/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
598/// If either value in a comparison is NaN, returns true.
599///
600/// \headerfile <x86intrin.h>
601///
602/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
603///
604/// \param __a
605/// A 128-bit vector of [2 x double].
606/// \param __b
607/// A 128-bit vector of [2 x double].
608/// \returns A 128-bit vector containing the comparison results.
609static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
610 __m128d __b) {
611 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
612}
613
614/// Compares each of the corresponding double-precision values of the
615/// 128-bit vectors of [2 x double] to determine if the values in the first
616/// operand are not less than or equal to those in the second operand.
617///
618/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
619/// If either value in a comparison is NaN, returns true.
620///
621/// \headerfile <x86intrin.h>
622///
623/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
624///
625/// \param __a
626/// A 128-bit vector of [2 x double].
627/// \param __b
628/// A 128-bit vector of [2 x double].
629/// \returns A 128-bit vector containing the comparison results.
630static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
631 __m128d __b) {
632 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
633}
634
635/// Compares each of the corresponding double-precision values of the
636/// 128-bit vectors of [2 x double] to determine if the values in the first
637/// operand are not greater than those in the second operand.
638///
639/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
640/// If either value in a comparison is NaN, returns true.
641///
642/// \headerfile <x86intrin.h>
643///
644/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
645///
646/// \param __a
647/// A 128-bit vector of [2 x double].
648/// \param __b
649/// A 128-bit vector of [2 x double].
650/// \returns A 128-bit vector containing the comparison results.
651static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
652 __m128d __b) {
653 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
654}
655
656/// Compares each of the corresponding double-precision values of the
657/// 128-bit vectors of [2 x double] to determine if the values in the first
658/// operand are not greater than or equal to those in the second operand.
659///
660/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
661/// If either value in a comparison is NaN, returns true.
662///
663/// \headerfile <x86intrin.h>
664///
665/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
666///
667/// \param __a
668/// A 128-bit vector of [2 x double].
669/// \param __b
670/// A 128-bit vector of [2 x double].
671/// \returns A 128-bit vector containing the comparison results.
672static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
673 __m128d __b) {
674 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
675}
676
677/// Compares the lower double-precision floating-point values in each of
678/// the two 128-bit floating-point vectors of [2 x double] for equality.
679///
680/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
681/// If either value in a comparison is NaN, returns false.
682///
683/// \headerfile <x86intrin.h>
684///
685/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
686///
687/// \param __a
688/// A 128-bit vector of [2 x double]. The lower double-precision value is
689/// compared to the lower double-precision value of \a __b.
690/// \param __b
691/// A 128-bit vector of [2 x double]. The lower double-precision value is
692/// compared to the lower double-precision value of \a __a.
693/// \returns A 128-bit vector. The lower 64 bits contains the comparison
694/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
696 __m128d __b) {
697 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
698}
699
700/// Compares the lower double-precision floating-point values in each of
701/// the two 128-bit floating-point vectors of [2 x double] to determine if
702/// the value in the first parameter is less than the corresponding value in
703/// the second parameter.
704///
705/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
706/// If either value in a comparison is NaN, returns false.
707///
708/// \headerfile <x86intrin.h>
709///
710/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
711///
712/// \param __a
713/// A 128-bit vector of [2 x double]. The lower double-precision value is
714/// compared to the lower double-precision value of \a __b.
715/// \param __b
716/// A 128-bit vector of [2 x double]. The lower double-precision value is
717/// compared to the lower double-precision value of \a __a.
718/// \returns A 128-bit vector. The lower 64 bits contains the comparison
719/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
720static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
721 __m128d __b) {
722 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
723}
724
725/// Compares the lower double-precision floating-point values in each of
726/// the two 128-bit floating-point vectors of [2 x double] to determine if
727/// the value in the first parameter is less than or equal to the
728/// corresponding value in the second parameter.
729///
730/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
731/// If either value in a comparison is NaN, returns false.
732///
733/// \headerfile <x86intrin.h>
734///
735/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
736///
737/// \param __a
738/// A 128-bit vector of [2 x double]. The lower double-precision value is
739/// compared to the lower double-precision value of \a __b.
740/// \param __b
741/// A 128-bit vector of [2 x double]. The lower double-precision value is
742/// compared to the lower double-precision value of \a __a.
743/// \returns A 128-bit vector. The lower 64 bits contains the comparison
744/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
745static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
746 __m128d __b) {
747 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
748}
749
750/// Compares the lower double-precision floating-point values in each of
751/// the two 128-bit floating-point vectors of [2 x double] to determine if
752/// the value in the first parameter is greater than the corresponding value
753/// in the second parameter.
754///
755/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
756/// If either value in a comparison is NaN, returns false.
757///
758/// \headerfile <x86intrin.h>
759///
760/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
761///
762/// \param __a
763/// A 128-bit vector of [2 x double]. The lower double-precision value is
764/// compared to the lower double-precision value of \a __b.
765/// \param __b
766/// A 128-bit vector of [2 x double]. The lower double-precision value is
767/// compared to the lower double-precision value of \a __a.
768/// \returns A 128-bit vector. The lower 64 bits contains the comparison
769/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
770static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
771 __m128d __b) {
772 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
773 return __extension__(__m128d){__c[0], __a[1]};
774}
775
776/// Compares the lower double-precision floating-point values in each of
777/// the two 128-bit floating-point vectors of [2 x double] to determine if
778/// the value in the first parameter is greater than or equal to the
779/// corresponding value in the second parameter.
780///
781/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
782/// If either value in a comparison is NaN, returns false.
783///
784/// \headerfile <x86intrin.h>
785///
786/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
787///
788/// \param __a
789/// A 128-bit vector of [2 x double]. The lower double-precision value is
790/// compared to the lower double-precision value of \a __b.
791/// \param __b
792/// A 128-bit vector of [2 x double]. The lower double-precision value is
793/// compared to the lower double-precision value of \a __a.
794/// \returns A 128-bit vector. The lower 64 bits contains the comparison
795/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
796static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
797 __m128d __b) {
798 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
799 return __extension__(__m128d){__c[0], __a[1]};
800}
801
802/// Compares the lower double-precision floating-point values in each of
803/// the two 128-bit floating-point vectors of [2 x double] to determine if
804/// the value in the first parameter is ordered with respect to the
805/// corresponding value in the second parameter.
806///
807/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
808/// of double-precision values are ordered with respect to each other if
809/// neither value is a NaN.
810///
811/// \headerfile <x86intrin.h>
812///
813/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
814///
815/// \param __a
816/// A 128-bit vector of [2 x double]. The lower double-precision value is
817/// compared to the lower double-precision value of \a __b.
818/// \param __b
819/// A 128-bit vector of [2 x double]. The lower double-precision value is
820/// compared to the lower double-precision value of \a __a.
821/// \returns A 128-bit vector. The lower 64 bits contains the comparison
822/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
823static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
824 __m128d __b) {
825 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
826}
827
828/// Compares the lower double-precision floating-point values in each of
829/// the two 128-bit floating-point vectors of [2 x double] to determine if
830/// the value in the first parameter is unordered with respect to the
831/// corresponding value in the second parameter.
832///
833/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
834/// of double-precision values are unordered with respect to each other if
835/// one or both values are NaN.
836///
837/// \headerfile <x86intrin.h>
838///
839/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
840/// instruction.
841///
842/// \param __a
843/// A 128-bit vector of [2 x double]. The lower double-precision value is
844/// compared to the lower double-precision value of \a __b.
845/// \param __b
846/// A 128-bit vector of [2 x double]. The lower double-precision value is
847/// compared to the lower double-precision value of \a __a.
848/// \returns A 128-bit vector. The lower 64 bits contains the comparison
849/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
850static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
851 __m128d __b) {
852 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
853}
854
855/// Compares the lower double-precision floating-point values in each of
856/// the two 128-bit floating-point vectors of [2 x double] to determine if
857/// the value in the first parameter is unequal to the corresponding value in
858/// the second parameter.
859///
860/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
861/// If either value in a comparison is NaN, returns true.
862///
863/// \headerfile <x86intrin.h>
864///
865/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
866///
867/// \param __a
868/// A 128-bit vector of [2 x double]. The lower double-precision value is
869/// compared to the lower double-precision value of \a __b.
870/// \param __b
871/// A 128-bit vector of [2 x double]. The lower double-precision value is
872/// compared to the lower double-precision value of \a __a.
873/// \returns A 128-bit vector. The lower 64 bits contains the comparison
874/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
875static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
876 __m128d __b) {
877 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
878}
879
880/// Compares the lower double-precision floating-point values in each of
881/// the two 128-bit floating-point vectors of [2 x double] to determine if
882/// the value in the first parameter is not less than the corresponding
883/// value in the second parameter.
884///
885/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
886/// If either value in a comparison is NaN, returns true.
887///
888/// \headerfile <x86intrin.h>
889///
890/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
891///
892/// \param __a
893/// A 128-bit vector of [2 x double]. The lower double-precision value is
894/// compared to the lower double-precision value of \a __b.
895/// \param __b
896/// A 128-bit vector of [2 x double]. The lower double-precision value is
897/// compared to the lower double-precision value of \a __a.
898/// \returns A 128-bit vector. The lower 64 bits contains the comparison
899/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
900static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
901 __m128d __b) {
902 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
903}
904
905/// Compares the lower double-precision floating-point values in each of
906/// the two 128-bit floating-point vectors of [2 x double] to determine if
907/// the value in the first parameter is not less than or equal to the
908/// corresponding value in the second parameter.
909///
910/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
911/// If either value in a comparison is NaN, returns true.
912///
913/// \headerfile <x86intrin.h>
914///
915/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
916///
917/// \param __a
918/// A 128-bit vector of [2 x double]. The lower double-precision value is
919/// compared to the lower double-precision value of \a __b.
920/// \param __b
921/// A 128-bit vector of [2 x double]. The lower double-precision value is
922/// compared to the lower double-precision value of \a __a.
923/// \returns A 128-bit vector. The lower 64 bits contains the comparison
924/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
925static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
926 __m128d __b) {
927 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
928}
929
930/// Compares the lower double-precision floating-point values in each of
931/// the two 128-bit floating-point vectors of [2 x double] to determine if
932/// the value in the first parameter is not greater than the corresponding
933/// value in the second parameter.
934///
935/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
936/// If either value in a comparison is NaN, returns true.
937///
938/// \headerfile <x86intrin.h>
939///
940/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
941///
942/// \param __a
943/// A 128-bit vector of [2 x double]. The lower double-precision value is
944/// compared to the lower double-precision value of \a __b.
945/// \param __b
946/// A 128-bit vector of [2 x double]. The lower double-precision value is
947/// compared to the lower double-precision value of \a __a.
948/// \returns A 128-bit vector. The lower 64 bits contains the comparison
949/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
950static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
951 __m128d __b) {
952 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
953 return __extension__(__m128d){__c[0], __a[1]};
954}
955
956/// Compares the lower double-precision floating-point values in each of
957/// the two 128-bit floating-point vectors of [2 x double] to determine if
958/// the value in the first parameter is not greater than or equal to the
959/// corresponding value in the second parameter.
960///
961/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
962/// If either value in a comparison is NaN, returns true.
963///
964/// \headerfile <x86intrin.h>
965///
966/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
967///
968/// \param __a
969/// A 128-bit vector of [2 x double]. The lower double-precision value is
970/// compared to the lower double-precision value of \a __b.
971/// \param __b
972/// A 128-bit vector of [2 x double]. The lower double-precision value is
973/// compared to the lower double-precision value of \a __a.
974/// \returns A 128-bit vector. The lower 64 bits contains the comparison
975/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
976static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
977 __m128d __b) {
978 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
979 return __extension__(__m128d){__c[0], __a[1]};
980}
981
982/// Compares the lower double-precision floating-point values in each of
983/// the two 128-bit floating-point vectors of [2 x double] for equality.
984///
985/// The comparison returns 0 for false, 1 for true. If either value in a
986/// comparison is NaN, returns 0.
987///
988/// \headerfile <x86intrin.h>
989///
990/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
991///
992/// \param __a
993/// A 128-bit vector of [2 x double]. The lower double-precision value is
994/// compared to the lower double-precision value of \a __b.
995/// \param __b
996/// A 128-bit vector of [2 x double]. The lower double-precision value is
997/// compared to the lower double-precision value of \a __a.
998/// \returns An integer containing the comparison results.
999static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
1000 __m128d __b) {
1001 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1002}
1003
1004/// Compares the lower double-precision floating-point values in each of
1005/// the two 128-bit floating-point vectors of [2 x double] to determine if
1006/// the value in the first parameter is less than the corresponding value in
1007/// the second parameter.
1008///
1009/// The comparison returns 0 for false, 1 for true. If either value in a
1010/// comparison is NaN, returns 0.
1011///
1012/// \headerfile <x86intrin.h>
1013///
1014/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1015///
1016/// \param __a
1017/// A 128-bit vector of [2 x double]. The lower double-precision value is
1018/// compared to the lower double-precision value of \a __b.
1019/// \param __b
1020/// A 128-bit vector of [2 x double]. The lower double-precision value is
1021/// compared to the lower double-precision value of \a __a.
1022/// \returns An integer containing the comparison results.
1023static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1024 __m128d __b) {
1025 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1026}
1027
1028/// Compares the lower double-precision floating-point values in each of
1029/// the two 128-bit floating-point vectors of [2 x double] to determine if
1030/// the value in the first parameter is less than or equal to the
1031/// corresponding value in the second parameter.
1032///
1033/// The comparison returns 0 for false, 1 for true. If either value in a
1034/// comparison is NaN, returns 0.
1035///
1036/// \headerfile <x86intrin.h>
1037///
1038/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1039///
1040/// \param __a
1041/// A 128-bit vector of [2 x double]. The lower double-precision value is
1042/// compared to the lower double-precision value of \a __b.
1043/// \param __b
1044/// A 128-bit vector of [2 x double]. The lower double-precision value is
1045/// compared to the lower double-precision value of \a __a.
1046/// \returns An integer containing the comparison results.
1047static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1048 __m128d __b) {
1049 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1050}
1051
1052/// Compares the lower double-precision floating-point values in each of
1053/// the two 128-bit floating-point vectors of [2 x double] to determine if
1054/// the value in the first parameter is greater than the corresponding value
1055/// in the second parameter.
1056///
1057/// The comparison returns 0 for false, 1 for true. If either value in a
1058/// comparison is NaN, returns 0.
1059///
1060/// \headerfile <x86intrin.h>
1061///
1062/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1063///
1064/// \param __a
1065/// A 128-bit vector of [2 x double]. The lower double-precision value is
1066/// compared to the lower double-precision value of \a __b.
1067/// \param __b
1068/// A 128-bit vector of [2 x double]. The lower double-precision value is
1069/// compared to the lower double-precision value of \a __a.
1070/// \returns An integer containing the comparison results.
1071static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1072 __m128d __b) {
1073 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1074}
1075
1076/// Compares the lower double-precision floating-point values in each of
1077/// the two 128-bit floating-point vectors of [2 x double] to determine if
1078/// the value in the first parameter is greater than or equal to the
1079/// corresponding value in the second parameter.
1080///
1081/// The comparison returns 0 for false, 1 for true. If either value in a
1082/// comparison is NaN, returns 0.
1083///
1084/// \headerfile <x86intrin.h>
1085///
1086/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1087///
1088/// \param __a
1089/// A 128-bit vector of [2 x double]. The lower double-precision value is
1090/// compared to the lower double-precision value of \a __b.
1091/// \param __b
1092/// A 128-bit vector of [2 x double]. The lower double-precision value is
1093/// compared to the lower double-precision value of \a __a.
1094/// \returns An integer containing the comparison results.
1095static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1096 __m128d __b) {
1097 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1098}
1099
1100/// Compares the lower double-precision floating-point values in each of
1101/// the two 128-bit floating-point vectors of [2 x double] to determine if
1102/// the value in the first parameter is unequal to the corresponding value in
1103/// the second parameter.
1104///
1105/// The comparison returns 0 for false, 1 for true. If either value in a
1106/// comparison is NaN, returns 1.
1107///
1108/// \headerfile <x86intrin.h>
1109///
1110/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1111///
1112/// \param __a
1113/// A 128-bit vector of [2 x double]. The lower double-precision value is
1114/// compared to the lower double-precision value of \a __b.
1115/// \param __b
1116/// A 128-bit vector of [2 x double]. The lower double-precision value is
1117/// compared to the lower double-precision value of \a __a.
1118/// \returns An integer containing the comparison results.
1119static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1120 __m128d __b) {
1121 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1122}
1123
1124/// Compares the lower double-precision floating-point values in each of
1125/// the two 128-bit floating-point vectors of [2 x double] for equality.
1126///
1127/// The comparison returns 0 for false, 1 for true. If either value in a
1128/// comparison is NaN, returns 0.
1129///
1130/// \headerfile <x86intrin.h>
1131///
1132/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1133///
1134/// \param __a
1135/// A 128-bit vector of [2 x double]. The lower double-precision value is
1136/// compared to the lower double-precision value of \a __b.
1137/// \param __b
1138/// A 128-bit vector of [2 x double]. The lower double-precision value is
1139/// compared to the lower double-precision value of \a __a.
1140/// \returns An integer containing the comparison results.
1141static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1142 __m128d __b) {
1143 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1144}
1145
1146/// Compares the lower double-precision floating-point values in each of
1147/// the two 128-bit floating-point vectors of [2 x double] to determine if
1148/// the value in the first parameter is less than the corresponding value in
1149/// the second parameter.
1150///
1151/// The comparison returns 0 for false, 1 for true. If either value in a
1152/// comparison is NaN, returns 0.
1153///
1154/// \headerfile <x86intrin.h>
1155///
1156/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1157///
1158/// \param __a
1159/// A 128-bit vector of [2 x double]. The lower double-precision value is
1160/// compared to the lower double-precision value of \a __b.
1161/// \param __b
1162/// A 128-bit vector of [2 x double]. The lower double-precision value is
1163/// compared to the lower double-precision value of \a __a.
1164/// \returns An integer containing the comparison results.
1165static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1166 __m128d __b) {
1167 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1168}
1169
1170/// Compares the lower double-precision floating-point values in each of
1171/// the two 128-bit floating-point vectors of [2 x double] to determine if
1172/// the value in the first parameter is less than or equal to the
1173/// corresponding value in the second parameter.
1174///
1175/// The comparison returns 0 for false, 1 for true. If either value in a
1176/// comparison is NaN, returns 0.
1177///
1178/// \headerfile <x86intrin.h>
1179///
1180/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1181///
1182/// \param __a
1183/// A 128-bit vector of [2 x double]. The lower double-precision value is
1184/// compared to the lower double-precision value of \a __b.
1185/// \param __b
1186/// A 128-bit vector of [2 x double]. The lower double-precision value is
1187/// compared to the lower double-precision value of \a __a.
1188/// \returns An integer containing the comparison results.
1189static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1190 __m128d __b) {
1191 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1192}
1193
1194/// Compares the lower double-precision floating-point values in each of
1195/// the two 128-bit floating-point vectors of [2 x double] to determine if
1196/// the value in the first parameter is greater than the corresponding value
1197/// in the second parameter.
1198///
1199/// The comparison returns 0 for false, 1 for true. If either value in a
1200/// comparison is NaN, returns 0.
1201///
1202/// \headerfile <x86intrin.h>
1203///
1204/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1205///
1206/// \param __a
1207/// A 128-bit vector of [2 x double]. The lower double-precision value is
1208/// compared to the lower double-precision value of \a __b.
1209/// \param __b
1210/// A 128-bit vector of [2 x double]. The lower double-precision value is
1211/// compared to the lower double-precision value of \a __a.
1212/// \returns An integer containing the comparison results.
1213static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1214 __m128d __b) {
1215 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1216}
1217
1218/// Compares the lower double-precision floating-point values in each of
1219/// the two 128-bit floating-point vectors of [2 x double] to determine if
1220/// the value in the first parameter is greater than or equal to the
1221/// corresponding value in the second parameter.
1222///
1223/// The comparison returns 0 for false, 1 for true. If either value in a
1224/// comparison is NaN, returns 0.
1225///
1226/// \headerfile <x86intrin.h>
1227///
1228/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1229///
1230/// \param __a
1231/// A 128-bit vector of [2 x double]. The lower double-precision value is
1232/// compared to the lower double-precision value of \a __b.
1233/// \param __b
1234/// A 128-bit vector of [2 x double]. The lower double-precision value is
1235/// compared to the lower double-precision value of \a __a.
1236/// \returns An integer containing the comparison results.
1237static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1238 __m128d __b) {
1239 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1240}
1241
1242/// Compares the lower double-precision floating-point values in each of
1243/// the two 128-bit floating-point vectors of [2 x double] to determine if
1244/// the value in the first parameter is unequal to the corresponding value in
1245/// the second parameter.
1246///
1247/// The comparison returns 0 for false, 1 for true. If either value in a
1248/// comparison is NaN, returns 1.
1249///
1250/// \headerfile <x86intrin.h>
1251///
1252/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1253///
1254/// \param __a
1255/// A 128-bit vector of [2 x double]. The lower double-precision value is
1256/// compared to the lower double-precision value of \a __b.
1257/// \param __b
1258/// A 128-bit vector of [2 x double]. The lower double-precision value is
1259/// compared to the lower double-precision value of \a __a.
1260/// \returns An integer containing the comparison result.
1261static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1262 __m128d __b) {
1263 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1264}
1265
1266/// Converts the two double-precision floating-point elements of a
1267/// 128-bit vector of [2 x double] into two single-precision floating-point
1268/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1269/// The upper 64 bits of the result vector are set to zero.
1270///
1271/// \headerfile <x86intrin.h>
1272///
1273/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1274///
1275/// \param __a
1276/// A 128-bit vector of [2 x double].
1277/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1278/// converted values. The upper 64 bits are set to zero.
1279static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1280 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1281}
1282
1283/// Converts the lower two single-precision floating-point elements of a
1284/// 128-bit vector of [4 x float] into two double-precision floating-point
1285/// values, returned in a 128-bit vector of [2 x double]. The upper two
1286/// elements of the input vector are unused.
1287///
1288/// \headerfile <x86intrin.h>
1289///
1290/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1291///
1292/// \param __a
1293/// A 128-bit vector of [4 x float]. The lower two single-precision
1294/// floating-point elements are converted to double-precision values. The
1295/// upper two elements are unused.
1296/// \returns A 128-bit vector of [2 x double] containing the converted values.
1297static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1298 return (__m128d) __builtin_convertvector(
1299 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1300}
1301
1302/// Converts the lower two integer elements of a 128-bit vector of
1303/// [4 x i32] into two double-precision floating-point values, returned in a
1304/// 128-bit vector of [2 x double].
1305///
1306/// The upper two elements of the input vector are unused.
1307///
1308/// \headerfile <x86intrin.h>
1309///
1310/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1311///
1312/// \param __a
1313/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1314/// converted to double-precision values.
1315///
1316/// The upper two elements are unused.
1317/// \returns A 128-bit vector of [2 x double] containing the converted values.
1318static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1319 return (__m128d) __builtin_convertvector(
1320 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1321}
1322
1323/// Converts the two double-precision floating-point elements of a
1324/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1325/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1326/// 64 bits of the result vector are set to zero.
1327///
1328/// If a converted value does not fit in a 32-bit integer, raises a
1329/// floating-point invalid exception. If the exception is masked, returns
1330/// the most negative integer.
1331///
1332/// \headerfile <x86intrin.h>
1333///
1334/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1335///
1336/// \param __a
1337/// A 128-bit vector of [2 x double].
1338/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1339/// converted values. The upper 64 bits are set to zero.
1340static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1341 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1342}
1343
1344/// Converts the low-order element of a 128-bit vector of [2 x double]
1345/// into a 32-bit signed integer value.
1346///
1347/// If the converted value does not fit in a 32-bit integer, raises a
1348/// floating-point invalid exception. If the exception is masked, returns
1349/// the most negative integer.
1350///
1351/// \headerfile <x86intrin.h>
1352///
1353/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1354///
1355/// \param __a
1356/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1357/// conversion.
1358/// \returns A 32-bit signed integer containing the converted value.
1359static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1360 return __builtin_ia32_cvtsd2si((__v2df)__a);
1361}
1362
1363/// Converts the lower double-precision floating-point element of a
1364/// 128-bit vector of [2 x double], in the second parameter, into a
1365/// single-precision floating-point value, returned in the lower 32 bits of a
1366/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1367/// copied from the upper 96 bits of the first parameter.
1368///
1369/// \headerfile <x86intrin.h>
1370///
1371/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1372///
1373/// \param __a
1374/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1375/// copied to the upper 96 bits of the result.
1376/// \param __b
1377/// A 128-bit vector of [2 x double]. The lower double-precision
1378/// floating-point element is used in the conversion.
1379/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1380/// converted value from the second parameter. The upper 96 bits are copied
1381/// from the upper 96 bits of the first parameter.
1382static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1383 __m128d __b) {
1384 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1385}
1386
1387/// Converts a 32-bit signed integer value, in the second parameter, into
1388/// a double-precision floating-point value, returned in the lower 64 bits of
1389/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1390/// are copied from the upper 64 bits of the first parameter.
1391///
1392/// \headerfile <x86intrin.h>
1393///
1394/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1395///
1396/// \param __a
1397/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1398/// copied to the upper 64 bits of the result.
1399/// \param __b
1400/// A 32-bit signed integer containing the value to be converted.
1401/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1402/// converted value from the second parameter. The upper 64 bits are copied
1403/// from the upper 64 bits of the first parameter.
1404static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1405 int __b) {
1406 __a[0] = __b;
1407 return __a;
1408}
1409
1410/// Converts the lower single-precision floating-point element of a
1411/// 128-bit vector of [4 x float], in the second parameter, into a
1412/// double-precision floating-point value, returned in the lower 64 bits of
1413/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1414/// are copied from the upper 64 bits of the first parameter.
1415///
1416/// \headerfile <x86intrin.h>
1417///
1418/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1419///
1420/// \param __a
1421/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1422/// copied to the upper 64 bits of the result.
1423/// \param __b
1424/// A 128-bit vector of [4 x float]. The lower single-precision
1425/// floating-point element is used in the conversion.
1426/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1427/// converted value from the second parameter. The upper 64 bits are copied
1428/// from the upper 64 bits of the first parameter.
1429static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1430 __m128 __b) {
1431 __a[0] = __b[0];
1432 return __a;
1433}
1434
1435/// Converts the two double-precision floating-point elements of a
1436/// 128-bit vector of [2 x double] into two signed truncated (rounded
1437/// toward zero) 32-bit integer values, returned in the lower 64 bits
1438/// of a 128-bit vector of [4 x i32].
1439///
1440/// If a converted value does not fit in a 32-bit integer, raises a
1441/// floating-point invalid exception. If the exception is masked, returns
1442/// the most negative integer.
1443///
1444/// \headerfile <x86intrin.h>
1445///
1446/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1447/// instruction.
1448///
1449/// \param __a
1450/// A 128-bit vector of [2 x double].
1451/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1452/// converted values. The upper 64 bits are set to zero.
1453static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1454 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1455}
1456
1457/// Converts the low-order element of a [2 x double] vector into a 32-bit
1458/// signed truncated (rounded toward zero) integer value.
1459///
1460/// If the converted value does not fit in a 32-bit integer, raises a
1461/// floating-point invalid exception. If the exception is masked, returns
1462/// the most negative integer.
1463///
1464/// \headerfile <x86intrin.h>
1465///
1466/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1467/// instruction.
1468///
1469/// \param __a
1470/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1471/// conversion.
1472/// \returns A 32-bit signed integer containing the converted value.
1473static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1474 return __builtin_ia32_cvttsd2si((__v2df)__a);
1475}
1476
1477/// Converts the two double-precision floating-point elements of a
1478/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1479/// returned in a 64-bit vector of [2 x i32].
1480///
1481/// If a converted value does not fit in a 32-bit integer, raises a
1482/// floating-point invalid exception. If the exception is masked, returns
1483/// the most negative integer.
1484///
1485/// \headerfile <x86intrin.h>
1486///
1487/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1488///
1489/// \param __a
1490/// A 128-bit vector of [2 x double].
1491/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1492static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
1493 return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
1494}
1495
1496/// Converts the two double-precision floating-point elements of a
1497/// 128-bit vector of [2 x double] into two signed truncated (rounded toward
1498/// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1499///
1500/// If a converted value does not fit in a 32-bit integer, raises a
1501/// floating-point invalid exception. If the exception is masked, returns
1502/// the most negative integer.
1503///
1504/// \headerfile <x86intrin.h>
1505///
1506/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1507///
1508/// \param __a
1509/// A 128-bit vector of [2 x double].
1510/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1511static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
1512 return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
1513}
1514
1515/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1516/// [2 x i32] into two double-precision floating-point values, returned in a
1517/// 128-bit vector of [2 x double].
1518///
1519/// \headerfile <x86intrin.h>
1520///
1521/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1522///
1523/// \param __a
1524/// A 64-bit vector of [2 x i32].
1525/// \returns A 128-bit vector of [2 x double] containing the converted values.
1526static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
1527 return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
1528}
1529
1530/// Returns the low-order element of a 128-bit vector of [2 x double] as
1531/// a double-precision floating-point value.
1532///
1533/// \headerfile <x86intrin.h>
1534///
1535/// This intrinsic has no corresponding instruction.
1536///
1537/// \param __a
1538/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1539/// \returns A double-precision floating-point value copied from the lower 64
1540/// bits of \a __a.
1541static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1542 return __a[0];
1543}
1544
1545/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1546/// memory location.
1547///
1548/// \headerfile <x86intrin.h>
1549///
1550/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1551///
1552/// \param __dp
1553/// A pointer to a 128-bit memory location. The address of the memory
1554/// location has to be 16-byte aligned.
1555/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1556static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1557 return *(const __m128d *)__dp;
1558}
1559
1560/// Loads a double-precision floating-point value from a specified memory
1561/// location and duplicates it to both vector elements of a 128-bit vector of
1562/// [2 x double].
1563///
1564/// \headerfile <x86intrin.h>
1565///
1566/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1567///
1568/// \param __dp
1569/// A pointer to a memory location containing a double-precision value.
1570/// \returns A 128-bit vector of [2 x double] containing the loaded and
1571/// duplicated values.
1572static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1573 struct __mm_load1_pd_struct {
1574 double __u;
1575 } __attribute__((__packed__, __may_alias__));
1576 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1577 return __extension__(__m128d){__u, __u};
1578}
1579
/// Alternate spelling of _mm_load1_pd; the argument is forwarded unchanged.
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1581
1582/// Loads two double-precision values, in reverse order, from an aligned
1583/// memory location into a 128-bit vector of [2 x double].
1584///
1585/// \headerfile <x86intrin.h>
1586///
1587/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1588/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1589/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1590///
1591/// \param __dp
1592/// A 16-byte aligned pointer to an array of double-precision values to be
1593/// loaded in reverse order.
1594/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1595/// values.
1596static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1597 __m128d __u = *(const __m128d *)__dp;
1598 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1599}
1600
1601/// Loads a 128-bit floating-point vector of [2 x double] from an
1602/// unaligned memory location.
1603///
1604/// \headerfile <x86intrin.h>
1605///
1606/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1607///
1608/// \param __dp
1609/// A pointer to a 128-bit memory location. The address of the memory
1610/// location does not have to be aligned.
1611/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1612static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1613 struct __loadu_pd {
1614 __m128d_u __v;
1615 } __attribute__((__packed__, __may_alias__));
1616 return ((const struct __loadu_pd *)__dp)->__v;
1617}
1618
1619/// Loads a 64-bit integer value to the low element of a 128-bit integer
1620/// vector and clears the upper element.
1621///
1622/// \headerfile <x86intrin.h>
1623///
1624/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1625///
1626/// \param __a
1627/// A pointer to a 64-bit memory location. The address of the memory
1628/// location does not have to be aligned.
1629/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1630static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1631 struct __loadu_si64 {
1632 long long __v;
1633 } __attribute__((__packed__, __may_alias__));
1634 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1635 return __extension__(__m128i)(__v2di){__u, 0LL};
1636}
1637
1638/// Loads a 32-bit integer value to the low element of a 128-bit integer
1639/// vector and clears the upper element.
1640///
1641/// \headerfile <x86intrin.h>
1642///
1643/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1644///
1645/// \param __a
1646/// A pointer to a 32-bit memory location. The address of the memory
1647/// location does not have to be aligned.
1648/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1649static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1650 struct __loadu_si32 {
1651 int __v;
1652 } __attribute__((__packed__, __may_alias__));
1653 int __u = ((const struct __loadu_si32 *)__a)->__v;
1654 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1655}
1656
1657/// Loads a 16-bit integer value to the low element of a 128-bit integer
1658/// vector and clears the upper element.
1659///
1660/// \headerfile <x86intrin.h>
1661///
1662/// This intrinsic does not correspond to a specific instruction.
1663///
1664/// \param __a
1665/// A pointer to a 16-bit memory location. The address of the memory
1666/// location does not have to be aligned.
1667/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1668static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1669 struct __loadu_si16 {
1670 short __v;
1671 } __attribute__((__packed__, __may_alias__));
1672 short __u = ((const struct __loadu_si16 *)__a)->__v;
1673 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1674}
1675
1676/// Loads a 64-bit double-precision value to the low element of a
1677/// 128-bit integer vector and clears the upper element.
1678///
1679/// \headerfile <x86intrin.h>
1680///
1681/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1682///
1683/// \param __dp
1684/// A pointer to a memory location containing a double-precision value.
1685/// The address of the memory location does not have to be aligned.
1686/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1687static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1688 struct __mm_load_sd_struct {
1689 double __u;
1690 } __attribute__((__packed__, __may_alias__));
1691 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1692 return __extension__(__m128d){__u, 0};
1693}
1694
1695/// Loads a double-precision value into the high-order bits of a 128-bit
1696/// vector of [2 x double]. The low-order bits are copied from the low-order
1697/// bits of the first operand.
1698///
1699/// \headerfile <x86intrin.h>
1700///
1701/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1702///
1703/// \param __a
1704/// A 128-bit vector of [2 x double]. \n
1705/// Bits [63:0] are written to bits [63:0] of the result.
1706/// \param __dp
1707/// A pointer to a 64-bit memory location containing a double-precision
1708/// floating-point value that is loaded. The loaded value is written to bits
1709/// [127:64] of the result. The address of the memory location does not have
1710/// to be aligned.
1711/// \returns A 128-bit vector of [2 x double] containing the moved values.
1712static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1713 double const *__dp) {
1714 struct __mm_loadh_pd_struct {
1715 double __u;
1716 } __attribute__((__packed__, __may_alias__));
1717 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1718 return __extension__(__m128d){__a[0], __u};
1719}
1720
1721/// Loads a double-precision value into the low-order bits of a 128-bit
1722/// vector of [2 x double]. The high-order bits are copied from the
1723/// high-order bits of the first operand.
1724///
1725/// \headerfile <x86intrin.h>
1726///
1727/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1728///
1729/// \param __a
1730/// A 128-bit vector of [2 x double]. \n
1731/// Bits [127:64] are written to bits [127:64] of the result.
1732/// \param __dp
1733/// A pointer to a 64-bit memory location containing a double-precision
1734/// floating-point value that is loaded. The loaded value is written to bits
1735/// [63:0] of the result. The address of the memory location does not have to
1736/// be aligned.
1737/// \returns A 128-bit vector of [2 x double] containing the moved values.
1738static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1739 double const *__dp) {
1740 struct __mm_loadl_pd_struct {
1741 double __u;
1742 } __attribute__((__packed__, __may_alias__));
1743 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1744 return __extension__(__m128d){__u, __a[1]};
1745}
1746
1747/// Constructs a 128-bit floating-point vector of [2 x double] with
1748/// unspecified content. This could be used as an argument to another
1749/// intrinsic function where the argument is required but the value is not
1750/// actually used.
1751///
1752/// \headerfile <x86intrin.h>
1753///
1754/// This intrinsic has no corresponding instruction.
1755///
1756/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1757/// content.
1758static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1759 return (__m128d)__builtin_ia32_undef128();
1760}
1761
1762/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1763/// 64 bits of the vector are initialized with the specified double-precision
1764/// floating-point value. The upper 64 bits are set to zero.
1765///
1766/// \headerfile <x86intrin.h>
1767///
1768/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1769///
1770/// \param __w
1771/// A double-precision floating-point value used to initialize the lower 64
1772/// bits of the result.
1773/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1774/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1775/// set to zero.
1776static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1777 return __extension__(__m128d){__w, 0};
1778}
1779
1780/// Constructs a 128-bit floating-point vector of [2 x double], with each
1781/// of the two double-precision floating-point vector elements set to the
1782/// specified double-precision floating-point value.
1783///
1784/// \headerfile <x86intrin.h>
1785///
1786/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1787///
1788/// \param __w
1789/// A double-precision floating-point value used to initialize each vector
1790/// element of the result.
1791/// \returns An initialized 128-bit floating-point vector of [2 x double].
1792static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1793 return __extension__(__m128d){__w, __w};
1794}
1795
1796/// Constructs a 128-bit floating-point vector of [2 x double], with each
1797/// of the two double-precision floating-point vector elements set to the
1798/// specified double-precision floating-point value.
1799///
1800/// \headerfile <x86intrin.h>
1801///
1802/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1803///
1804/// \param __w
1805/// A double-precision floating-point value used to initialize each vector
1806/// element of the result.
1807/// \returns An initialized 128-bit floating-point vector of [2 x double].
1808static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1809 return _mm_set1_pd(__w);
1810}
1811
1812/// Constructs a 128-bit floating-point vector of [2 x double]
1813/// initialized with the specified double-precision floating-point values.
1814///
1815/// \headerfile <x86intrin.h>
1816///
1817/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1818///
1819/// \param __w
1820/// A double-precision floating-point value used to initialize the upper 64
1821/// bits of the result.
1822/// \param __x
1823/// A double-precision floating-point value used to initialize the lower 64
1824/// bits of the result.
1825/// \returns An initialized 128-bit floating-point vector of [2 x double].
1826static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1827 double __x) {
1828 return __extension__(__m128d){__x, __w};
1829}
1830
1831/// Constructs a 128-bit floating-point vector of [2 x double],
1832/// initialized in reverse order with the specified double-precision
1833/// floating-point values.
1834///
1835/// \headerfile <x86intrin.h>
1836///
1837/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1838///
1839/// \param __w
1840/// A double-precision floating-point value used to initialize the lower 64
1841/// bits of the result.
1842/// \param __x
1843/// A double-precision floating-point value used to initialize the upper 64
1844/// bits of the result.
1845/// \returns An initialized 128-bit floating-point vector of [2 x double].
1846static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1847 double __x) {
1848 return __extension__(__m128d){__w, __x};
1849}
1850
1851/// Constructs a 128-bit floating-point vector of [2 x double]
1852/// initialized to zero.
1853///
1854/// \headerfile <x86intrin.h>
1855///
1856/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1857///
1858/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1859/// all elements set to zero.
1860static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1861 return __extension__(__m128d){0.0, 0.0};
1862}
1863
1864/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1865/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1866/// 64 bits are set to the upper 64 bits of the first parameter.
1867///
1868/// \headerfile <x86intrin.h>
1869///
1870/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1871///
1872/// \param __a
1873/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1874/// upper 64 bits of the result.
1875/// \param __b
1876/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1877/// lower 64 bits of the result.
1878/// \returns A 128-bit vector of [2 x double] containing the moved values.
1879static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1880 __m128d __b) {
1881 __a[0] = __b[0];
1882 return __a;
1883}
1884
1885/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1886/// memory location.
1887///
1888/// \headerfile <x86intrin.h>
1889///
1890/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1891///
1892/// \param __dp
1893/// A pointer to a 64-bit memory location.
1894/// \param __a
1895/// A 128-bit vector of [2 x double] containing the value to be stored.
1896static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1897 __m128d __a) {
1898 struct __mm_store_sd_struct {
1899 double __u;
1900 } __attribute__((__packed__, __may_alias__));
1901 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1902}
1903
1904/// Moves packed double-precision values from a 128-bit vector of
1905/// [2 x double] to a memory location.
1906///
1907/// \headerfile <x86intrin.h>
1908///
1909/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1910///
1911/// \param __dp
1912/// A pointer to an aligned memory location that can store two
1913/// double-precision values.
1914/// \param __a
1915/// A packed 128-bit vector of [2 x double] containing the values to be
1916/// moved.
1917static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1918 __m128d __a) {
1919 *(__m128d *)__dp = __a;
1920}
1921
1922/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1923/// the upper and lower 64 bits of a memory location.
1924///
1925/// \headerfile <x86intrin.h>
1926///
1927/// This intrinsic corresponds to the
1928/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1929///
1930/// \param __dp
1931/// A pointer to a memory location that can store two double-precision
1932/// values.
1933/// \param __a
1934/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1935/// of the values in \a __dp.
1936static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1937 __m128d __a) {
1938 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1939 _mm_store_pd(__dp, __a);
1940}
1941
1942/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1943/// the upper and lower 64 bits of a memory location.
1944///
1945/// \headerfile <x86intrin.h>
1946///
1947/// This intrinsic corresponds to the
1948/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1949///
1950/// \param __dp
1951/// A pointer to a memory location that can store two double-precision
1952/// values.
1953/// \param __a
1954/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1955/// of the values in \a __dp.
1956static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1957 __m128d __a) {
1958 _mm_store1_pd(__dp, __a);
1959}
1960
1961/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1962/// location.
1963///
1964/// \headerfile <x86intrin.h>
1965///
1966/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1967///
1968/// \param __dp
1969/// A pointer to a 128-bit memory location. The address of the memory
1970/// location does not have to be aligned.
1971/// \param __a
1972/// A 128-bit vector of [2 x double] containing the values to be stored.
1973static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1974 __m128d __a) {
1975 struct __storeu_pd {
1976 __m128d_u __v;
1977 } __attribute__((__packed__, __may_alias__));
1978 ((struct __storeu_pd *)__dp)->__v = __a;
1979}
1980
1981/// Stores two double-precision values, in reverse order, from a 128-bit
1982/// vector of [2 x double] to a 16-byte aligned memory location.
1983///
1984/// \headerfile <x86intrin.h>
1985///
1986/// This intrinsic corresponds to a shuffling instruction followed by a
1987/// <c> VMOVAPD / MOVAPD </c> instruction.
1988///
1989/// \param __dp
1990/// A pointer to a 16-byte aligned memory location that can store two
1991/// double-precision values.
1992/// \param __a
1993/// A 128-bit vector of [2 x double] containing the values to be reversed and
1994/// stored.
1995static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1996 __m128d __a) {
1997 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1998 *(__m128d *)__dp = __a;
1999}
2000
2001/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2002/// memory location.
2003///
2004/// \headerfile <x86intrin.h>
2005///
2006/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2007///
2008/// \param __dp
2009/// A pointer to a 64-bit memory location.
2010/// \param __a
2011/// A 128-bit vector of [2 x double] containing the value to be stored.
2012static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2013 __m128d __a) {
2014 struct __mm_storeh_pd_struct {
2015 double __u;
2016 } __attribute__((__packed__, __may_alias__));
2017 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2018}
2019
2020/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2021/// memory location.
2022///
2023/// \headerfile <x86intrin.h>
2024///
2025/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2026///
2027/// \param __dp
2028/// A pointer to a 64-bit memory location.
2029/// \param __a
2030/// A 128-bit vector of [2 x double] containing the value to be stored.
2031static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2032 __m128d __a) {
2033 struct __mm_storeh_pd_struct {
2034 double __u;
2035 } __attribute__((__packed__, __may_alias__));
2036 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2037}
2038
2039/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2040/// saving the lower 8 bits of each sum in the corresponding element of a
2041/// 128-bit result vector of [16 x i8].
2042///
2043/// The integer elements of both parameters can be either signed or unsigned.
2044///
2045/// \headerfile <x86intrin.h>
2046///
2047/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2048///
2049/// \param __a
2050/// A 128-bit vector of [16 x i8].
2051/// \param __b
2052/// A 128-bit vector of [16 x i8].
2053/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2054/// parameters.
2055static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2056 __m128i __b) {
2057 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2058}
2059
2060/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2061/// saving the lower 16 bits of each sum in the corresponding element of a
2062/// 128-bit result vector of [8 x i16].
2063///
2064/// The integer elements of both parameters can be either signed or unsigned.
2065///
2066/// \headerfile <x86intrin.h>
2067///
2068/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2069///
2070/// \param __a
2071/// A 128-bit vector of [8 x i16].
2072/// \param __b
2073/// A 128-bit vector of [8 x i16].
2074/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2075/// parameters.
2076static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2077 __m128i __b) {
2078 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2079}
2080
2081/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2082/// saving the lower 32 bits of each sum in the corresponding element of a
2083/// 128-bit result vector of [4 x i32].
2084///
2085/// The integer elements of both parameters can be either signed or unsigned.
2086///
2087/// \headerfile <x86intrin.h>
2088///
2089/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2090///
2091/// \param __a
2092/// A 128-bit vector of [4 x i32].
2093/// \param __b
2094/// A 128-bit vector of [4 x i32].
2095/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2096/// parameters.
2097static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2098 __m128i __b) {
2099 return (__m128i)((__v4su)__a + (__v4su)__b);
2100}
2101
2102/// Adds two signed or unsigned 64-bit integer values, returning the
2103/// lower 64 bits of the sum.
2104///
2105/// \headerfile <x86intrin.h>
2106///
2107/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2108///
2109/// \param __a
2110/// A 64-bit integer.
2111/// \param __b
2112/// A 64-bit integer.
2113/// \returns A 64-bit integer containing the sum of both parameters.
2114static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
2115 return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
2116}
2117
2118/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2119/// saving the lower 64 bits of each sum in the corresponding element of a
2120/// 128-bit result vector of [2 x i64].
2121///
2122/// The integer elements of both parameters can be either signed or unsigned.
2123///
2124/// \headerfile <x86intrin.h>
2125///
2126/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2127///
2128/// \param __a
2129/// A 128-bit vector of [2 x i64].
2130/// \param __b
2131/// A 128-bit vector of [2 x i64].
2132/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2133/// parameters.
2134static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2135 __m128i __b) {
2136 return (__m128i)((__v2du)__a + (__v2du)__b);
2137}
2138
2139/// Adds, with saturation, the corresponding elements of two 128-bit
2140/// signed [16 x i8] vectors, saving each sum in the corresponding element
2141/// of a 128-bit result vector of [16 x i8].
2142///
2143/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2144/// less than 0x80 are saturated to 0x80.
2145///
2146/// \headerfile <x86intrin.h>
2147///
2148/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2149///
2150/// \param __a
2151/// A 128-bit signed [16 x i8] vector.
2152/// \param __b
2153/// A 128-bit signed [16 x i8] vector.
2154/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2155/// both parameters.
2156static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2157 __m128i __b) {
2158 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2159}
2160
2161/// Adds, with saturation, the corresponding elements of two 128-bit
2162/// signed [8 x i16] vectors, saving each sum in the corresponding element
2163/// of a 128-bit result vector of [8 x i16].
2164///
2165/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2166/// less than 0x8000 are saturated to 0x8000.
2167///
2168/// \headerfile <x86intrin.h>
2169///
2170/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2171///
2172/// \param __a
2173/// A 128-bit signed [8 x i16] vector.
2174/// \param __b
2175/// A 128-bit signed [8 x i16] vector.
2176/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2177/// both parameters.
2178static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2179 __m128i __b) {
2180 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2181}
2182
2183/// Adds, with saturation, the corresponding elements of two 128-bit
2184/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2185/// of a 128-bit result vector of [16 x i8].
2186///
2187/// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2188/// saturated to 0x00.
2189///
2190/// \headerfile <x86intrin.h>
2191///
2192/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2193///
2194/// \param __a
2195/// A 128-bit unsigned [16 x i8] vector.
2196/// \param __b
2197/// A 128-bit unsigned [16 x i8] vector.
2198/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2199/// of both parameters.
2200static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2201 __m128i __b) {
2202 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2203}
2204
2205/// Adds, with saturation, the corresponding elements of two 128-bit
2206/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2207/// of a 128-bit result vector of [8 x i16].
2208///
2209/// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2210/// are saturated to 0x0000.
2211///
2212/// \headerfile <x86intrin.h>
2213///
2214/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2215///
2216/// \param __a
2217/// A 128-bit unsigned [8 x i16] vector.
2218/// \param __b
2219/// A 128-bit unsigned [8 x i16] vector.
2220/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2221/// of both parameters.
2222static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2223 __m128i __b) {
2224 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2225}
2226
2227/// Computes the rounded averages of corresponding elements of two
2228/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2229/// corresponding element of a 128-bit result vector of [16 x i8].
2230///
2231/// \headerfile <x86intrin.h>
2232///
2233/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2234///
2235/// \param __a
2236/// A 128-bit unsigned [16 x i8] vector.
2237/// \param __b
2238/// A 128-bit unsigned [16 x i8] vector.
2239/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2240/// averages of both parameters.
2241static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2242 __m128i __b) {
2243 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2244}
2245
2246/// Computes the rounded averages of corresponding elements of two
2247/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2248/// corresponding element of a 128-bit result vector of [8 x i16].
2249///
2250/// \headerfile <x86intrin.h>
2251///
2252/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2253///
2254/// \param __a
2255/// A 128-bit unsigned [8 x i16] vector.
2256/// \param __b
2257/// A 128-bit unsigned [8 x i16] vector.
2258/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2259/// averages of both parameters.
2260static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2261 __m128i __b) {
2262 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2263}
2264
2265/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2266/// vectors, producing eight intermediate 32-bit signed integer products, and
2267/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2268/// [4 x i32] vector.
2269///
2270/// For example, bits [15:0] of both parameters are multiplied producing a
2271/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2272/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2273/// of the result.
2274///
2275/// \headerfile <x86intrin.h>
2276///
2277/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2278///
2279/// \param __a
2280/// A 128-bit signed [8 x i16] vector.
2281/// \param __b
2282/// A 128-bit signed [8 x i16] vector.
2283/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2284/// of both parameters.
2285static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2286 __m128i __b) {
2287 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2288}
2289
2290/// Compares corresponding elements of two 128-bit signed [8 x i16]
2291/// vectors, saving the greater value from each comparison in the
2292/// corresponding element of a 128-bit result vector of [8 x i16].
2293///
2294/// \headerfile <x86intrin.h>
2295///
2296/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2297///
2298/// \param __a
2299/// A 128-bit signed [8 x i16] vector.
2300/// \param __b
2301/// A 128-bit signed [8 x i16] vector.
2302/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2303/// each comparison.
2304static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2305 __m128i __b) {
2306 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2307}
2308
2309/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2310/// vectors, saving the greater value from each comparison in the
2311/// corresponding element of a 128-bit result vector of [16 x i8].
2312///
2313/// \headerfile <x86intrin.h>
2314///
2315/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2316///
2317/// \param __a
2318/// A 128-bit unsigned [16 x i8] vector.
2319/// \param __b
2320/// A 128-bit unsigned [16 x i8] vector.
2321/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2322/// each comparison.
2323static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2324 __m128i __b) {
2325 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2326}
2327
2328/// Compares corresponding elements of two 128-bit signed [8 x i16]
2329/// vectors, saving the smaller value from each comparison in the
2330/// corresponding element of a 128-bit result vector of [8 x i16].
2331///
2332/// \headerfile <x86intrin.h>
2333///
2334/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2335///
2336/// \param __a
2337/// A 128-bit signed [8 x i16] vector.
2338/// \param __b
2339/// A 128-bit signed [8 x i16] vector.
2340/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2341/// each comparison.
2342static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2343 __m128i __b) {
2344 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2345}
2346
2347/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2348/// vectors, saving the smaller value from each comparison in the
2349/// corresponding element of a 128-bit result vector of [16 x i8].
2350///
2351/// \headerfile <x86intrin.h>
2352///
2353/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2354///
2355/// \param __a
2356/// A 128-bit unsigned [16 x i8] vector.
2357/// \param __b
2358/// A 128-bit unsigned [16 x i8] vector.
2359/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2360/// each comparison.
2361static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2362 __m128i __b) {
2363 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2364}
2365
2366/// Multiplies the corresponding elements of two signed [8 x i16]
2367/// vectors, saving the upper 16 bits of each 32-bit product in the
2368/// corresponding element of a 128-bit signed [8 x i16] result vector.
2369///
2370/// \headerfile <x86intrin.h>
2371///
2372/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2373///
2374/// \param __a
2375/// A 128-bit signed [8 x i16] vector.
2376/// \param __b
2377/// A 128-bit signed [8 x i16] vector.
2378/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2379/// each of the eight 32-bit products.
2380static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2381 __m128i __b) {
2382 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2383}
2384
2385/// Multiplies the corresponding elements of two unsigned [8 x i16]
2386/// vectors, saving the upper 16 bits of each 32-bit product in the
2387/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2388///
2389/// \headerfile <x86intrin.h>
2390///
2391/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2392///
2393/// \param __a
2394/// A 128-bit unsigned [8 x i16] vector.
2395/// \param __b
2396/// A 128-bit unsigned [8 x i16] vector.
2397/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2398/// of each of the eight 32-bit products.
2399static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2400 __m128i __b) {
2401 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2402}
2403
2404/// Multiplies the corresponding elements of two signed [8 x i16]
2405/// vectors, saving the lower 16 bits of each 32-bit product in the
2406/// corresponding element of a 128-bit signed [8 x i16] result vector.
2407///
2408/// \headerfile <x86intrin.h>
2409///
2410/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2411///
2412/// \param __a
2413/// A 128-bit signed [8 x i16] vector.
2414/// \param __b
2415/// A 128-bit signed [8 x i16] vector.
2416/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2417/// each of the eight 32-bit products.
2418static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2419 __m128i __b) {
2420 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2421}
2422
2423/// Multiplies 32-bit unsigned integer values contained in the lower bits
2424/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2425/// product.
2426///
2427/// \headerfile <x86intrin.h>
2428///
2429/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2430///
2431/// \param __a
2432/// A 64-bit integer containing one of the source operands.
2433/// \param __b
2434/// A 64-bit integer containing one of the source operands.
2435/// \returns A 64-bit integer vector containing the product of both operands.
2436static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
2437 return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
2438 (__v4si)__anyext128(__b)));
2439}
2440
2441/// Multiplies 32-bit unsigned integer values contained in the lower
2442/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2443/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2444///
2445/// \headerfile <x86intrin.h>
2446///
2447/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2448///
2449/// \param __a
2450/// A [2 x i64] vector containing one of the source operands.
2451/// \param __b
2452/// A [2 x i64] vector containing one of the source operands.
2453/// \returns A [2 x i64] vector containing the product of both operands.
2454static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2455 __m128i __b) {
2456 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2457}
2458
2459/// Computes the absolute differences of corresponding 8-bit integer
2460/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2461/// separately sums the second 8 absolute differences. Packs these two
2462/// unsigned 16-bit integer sums into the upper and lower elements of a
2463/// [2 x i64] vector.
2464///
2465/// \headerfile <x86intrin.h>
2466///
2467/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2468///
2469/// \param __a
2470/// A 128-bit integer vector containing one of the source operands.
2471/// \param __b
2472/// A 128-bit integer vector containing one of the source operands.
2473/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2474/// differences between both operands.
2475static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2476 __m128i __b) {
2477 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2478}
2479
2480/// Subtracts the corresponding 8-bit integer values in the operands.
2481///
2482/// \headerfile <x86intrin.h>
2483///
2484/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2485///
2486/// \param __a
2487/// A 128-bit integer vector containing the minuends.
2488/// \param __b
2489/// A 128-bit integer vector containing the subtrahends.
2490/// \returns A 128-bit integer vector containing the differences of the values
2491/// in the operands.
2492static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2493 __m128i __b) {
2494 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2495}
2496
2497/// Subtracts the corresponding 16-bit integer values in the operands.
2498///
2499/// \headerfile <x86intrin.h>
2500///
2501/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2502///
2503/// \param __a
2504/// A 128-bit integer vector containing the minuends.
2505/// \param __b
2506/// A 128-bit integer vector containing the subtrahends.
2507/// \returns A 128-bit integer vector containing the differences of the values
2508/// in the operands.
2509static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2510 __m128i __b) {
2511 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2512}
2513
2514/// Subtracts the corresponding 32-bit integer values in the operands.
2515///
2516/// \headerfile <x86intrin.h>
2517///
2518/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2519///
2520/// \param __a
2521/// A 128-bit integer vector containing the minuends.
2522/// \param __b
2523/// A 128-bit integer vector containing the subtrahends.
2524/// \returns A 128-bit integer vector containing the differences of the values
2525/// in the operands.
2526static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2527 __m128i __b) {
2528 return (__m128i)((__v4su)__a - (__v4su)__b);
2529}
2530
2531/// Subtracts signed or unsigned 64-bit integer values and writes the
2532/// difference to the corresponding bits in the destination.
2533///
2534/// \headerfile <x86intrin.h>
2535///
2536/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2537///
2538/// \param __a
2539/// A 64-bit integer vector containing the minuend.
2540/// \param __b
2541/// A 64-bit integer vector containing the subtrahend.
2542/// \returns A 64-bit integer vector containing the difference of the values in
2543/// the operands.
2544static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
2545 return (__m64)((unsigned long long)__a - (unsigned long long)__b);
2546}
2547
2548/// Subtracts the corresponding elements of two [2 x i64] vectors.
2549///
2550/// \headerfile <x86intrin.h>
2551///
2552/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2553///
2554/// \param __a
2555/// A 128-bit integer vector containing the minuends.
2556/// \param __b
2557/// A 128-bit integer vector containing the subtrahends.
2558/// \returns A 128-bit integer vector containing the differences of the values
2559/// in the operands.
2560static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2561 __m128i __b) {
2562 return (__m128i)((__v2du)__a - (__v2du)__b);
2563}
2564
2565/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2566/// the input and returns the differences in the corresponding bytes in the
2567/// destination.
2568///
2569/// Differences greater than 0x7F are saturated to 0x7F, and differences
2570/// less than 0x80 are saturated to 0x80.
2571///
2572/// \headerfile <x86intrin.h>
2573///
2574/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2575///
2576/// \param __a
2577/// A 128-bit integer vector containing the minuends.
2578/// \param __b
2579/// A 128-bit integer vector containing the subtrahends.
2580/// \returns A 128-bit integer vector containing the differences of the values
2581/// in the operands.
2582static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2583 __m128i __b) {
2584 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2585}
2586
2587/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2588/// the input and returns the differences in the corresponding bytes in the
2589/// destination.
2590///
2591/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2592/// than 0x8000 are saturated to 0x8000.
2593///
2594/// \headerfile <x86intrin.h>
2595///
2596/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2597///
2598/// \param __a
2599/// A 128-bit integer vector containing the minuends.
2600/// \param __b
2601/// A 128-bit integer vector containing the subtrahends.
2602/// \returns A 128-bit integer vector containing the differences of the values
2603/// in the operands.
2604static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2605 __m128i __b) {
2606 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2607}
2608
2609/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2610/// the input and returns the differences in the corresponding bytes in the
2611/// destination.
2612///
2613/// Differences less than 0x00 are saturated to 0x00.
2614///
2615/// \headerfile <x86intrin.h>
2616///
2617/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2618///
2619/// \param __a
2620/// A 128-bit integer vector containing the minuends.
2621/// \param __b
2622/// A 128-bit integer vector containing the subtrahends.
2623/// \returns A 128-bit integer vector containing the unsigned integer
2624/// differences of the values in the operands.
2625static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2626 __m128i __b) {
2627 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2628}
2629
2630/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2631/// the input and returns the differences in the corresponding bytes in the
2632/// destination.
2633///
2634/// Differences less than 0x0000 are saturated to 0x0000.
2635///
2636/// \headerfile <x86intrin.h>
2637///
2638/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2639///
2640/// \param __a
2641/// A 128-bit integer vector containing the minuends.
2642/// \param __b
2643/// A 128-bit integer vector containing the subtrahends.
2644/// \returns A 128-bit integer vector containing the unsigned integer
2645/// differences of the values in the operands.
2646static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2647 __m128i __b) {
2648 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2649}
2650
2651/// Performs a bitwise AND of two 128-bit integer vectors.
2652///
2653/// \headerfile <x86intrin.h>
2654///
2655/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2656///
2657/// \param __a
2658/// A 128-bit integer vector containing one of the source operands.
2659/// \param __b
2660/// A 128-bit integer vector containing one of the source operands.
2661/// \returns A 128-bit integer vector containing the bitwise AND of the values
2662/// in both operands.
2663static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2664 __m128i __b) {
2665 return (__m128i)((__v2du)__a & (__v2du)__b);
2666}
2667
2668/// Performs a bitwise AND of two 128-bit integer vectors, using the
2669/// one's complement of the values contained in the first source operand.
2670///
2671/// \headerfile <x86intrin.h>
2672///
2673/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2674///
2675/// \param __a
2676/// A 128-bit vector containing the left source operand. The one's complement
2677/// of this value is used in the bitwise AND.
2678/// \param __b
2679/// A 128-bit vector containing the right source operand.
2680/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2681/// complement of the first operand and the values in the second operand.
2682static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2683 __m128i __b) {
2684 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2685}
2686/// Performs a bitwise OR of two 128-bit integer vectors.
2687///
2688/// \headerfile <x86intrin.h>
2689///
2690/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2691///
2692/// \param __a
2693/// A 128-bit integer vector containing one of the source operands.
2694/// \param __b
2695/// A 128-bit integer vector containing one of the source operands.
2696/// \returns A 128-bit integer vector containing the bitwise OR of the values
2697/// in both operands.
2698static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2699 __m128i __b) {
2700 return (__m128i)((__v2du)__a | (__v2du)__b);
2701}
2702
2703/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2704///
2705/// \headerfile <x86intrin.h>
2706///
2707/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2708///
2709/// \param __a
2710/// A 128-bit integer vector containing one of the source operands.
2711/// \param __b
2712/// A 128-bit integer vector containing one of the source operands.
2713/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2714/// values in both operands.
2715static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2716 __m128i __b) {
2717 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2718}
2719
/// Shifts the whole 128-bit integer vector operand left by the given number
///    of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted left.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2740
/// Shifts a 128-bit integer vector left by a number of bytes. This macro has
///    the same expansion as \c _mm_slli_si128.
///
/// \param a
///    The 128-bit integer vector to shift; it is converted to \c __m128i.
/// \param imm
///    The byte count; it is converted to \c int.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2744
2745/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2746/// by the specified number of bits. Low-order bits are cleared.
2747///
2748/// \headerfile <x86intrin.h>
2749///
2750/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2751///
2752/// \param __a
2753/// A 128-bit integer vector containing the source operand.
2754/// \param __count
2755/// An integer value specifying the number of bits to left-shift each value
2756/// in operand \a __a.
2757/// \returns A 128-bit integer vector containing the left-shifted values.
2758static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2759 int __count) {
2760 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2761}
2762
2763/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2764/// by the specified number of bits. Low-order bits are cleared.
2765///
2766/// \headerfile <x86intrin.h>
2767///
2768/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2769///
2770/// \param __a
2771/// A 128-bit integer vector containing the source operand.
2772/// \param __count
2773/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2774/// to left-shift each value in operand \a __a.
2775/// \returns A 128-bit integer vector containing the left-shifted values.
2776static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2777 __m128i __count) {
2778 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2779}
2780
2781/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2782/// by the specified number of bits. Low-order bits are cleared.
2783///
2784/// \headerfile <x86intrin.h>
2785///
2786/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2787///
2788/// \param __a
2789/// A 128-bit integer vector containing the source operand.
2790/// \param __count
2791/// An integer value specifying the number of bits to left-shift each value
2792/// in operand \a __a.
2793/// \returns A 128-bit integer vector containing the left-shifted values.
2794static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2795 int __count) {
2796 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2797}
2798
2799/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2800/// by the specified number of bits. Low-order bits are cleared.
2801///
2802/// \headerfile <x86intrin.h>
2803///
2804/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2805///
2806/// \param __a
2807/// A 128-bit integer vector containing the source operand.
2808/// \param __count
2809/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2810/// to left-shift each value in operand \a __a.
2811/// \returns A 128-bit integer vector containing the left-shifted values.
2812static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2813 __m128i __count) {
2814 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2815}
2816
2817/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2818/// by the specified number of bits. Low-order bits are cleared.
2819///
2820/// \headerfile <x86intrin.h>
2821///
2822/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2823///
2824/// \param __a
2825/// A 128-bit integer vector containing the source operand.
2826/// \param __count
2827/// An integer value specifying the number of bits to left-shift each value
2828/// in operand \a __a.
2829/// \returns A 128-bit integer vector containing the left-shifted values.
2830static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2831 int __count) {
2832 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2833}
2834
2835/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2836/// by the specified number of bits. Low-order bits are cleared.
2837///
2838/// \headerfile <x86intrin.h>
2839///
2840/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2841///
2842/// \param __a
2843/// A 128-bit integer vector containing the source operand.
2844/// \param __count
2845/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2846/// to left-shift each value in operand \a __a.
2847/// \returns A 128-bit integer vector containing the left-shifted values.
2848static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2849 __m128i __count) {
2850 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2851}
2852
2853/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2854/// by the specified number of bits. High-order bits are filled with the sign
2855/// bit of the initial value.
2856///
2857/// \headerfile <x86intrin.h>
2858///
2859/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2860///
2861/// \param __a
2862/// A 128-bit integer vector containing the source operand.
2863/// \param __count
2864/// An integer value specifying the number of bits to right-shift each value
2865/// in operand \a __a.
2866/// \returns A 128-bit integer vector containing the right-shifted values.
2867static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2868 int __count) {
2869 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2870}
2871
2872/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2873/// by the specified number of bits. High-order bits are filled with the sign
2874/// bit of the initial value.
2875///
2876/// \headerfile <x86intrin.h>
2877///
2878/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2879///
2880/// \param __a
2881/// A 128-bit integer vector containing the source operand.
2882/// \param __count
2883/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2884/// to right-shift each value in operand \a __a.
2885/// \returns A 128-bit integer vector containing the right-shifted values.
2886static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2887 __m128i __count) {
2888 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2889}
2890
2891/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2892/// by the specified number of bits. High-order bits are filled with the sign
2893/// bit of the initial value.
2894///
2895/// \headerfile <x86intrin.h>
2896///
2897/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2898///
2899/// \param __a
2900/// A 128-bit integer vector containing the source operand.
2901/// \param __count
2902/// An integer value specifying the number of bits to right-shift each value
2903/// in operand \a __a.
2904/// \returns A 128-bit integer vector containing the right-shifted values.
2905static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2906 int __count) {
2907 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2908}
2909
2910/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2911/// by the specified number of bits. High-order bits are filled with the sign
2912/// bit of the initial value.
2913///
2914/// \headerfile <x86intrin.h>
2915///
2916/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2917///
2918/// \param __a
2919/// A 128-bit integer vector containing the source operand.
2920/// \param __count
2921/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2922/// to right-shift each value in operand \a __a.
2923/// \returns A 128-bit integer vector containing the right-shifted values.
2924static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2925 __m128i __count) {
2926 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2927}
2928
/// Shifts the whole 128-bit integer vector operand right by the given number
///    of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted right.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2949
/// Shifts a 128-bit integer vector right by a number of bytes. This macro has
///    the same expansion as \c _mm_srli_si128.
///
/// \param a
///    The 128-bit integer vector to shift; it is converted to \c __m128i.
/// \param imm
///    The byte count; it is converted to \c int.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2953
2954/// Right-shifts each of 16-bit values in the 128-bit integer vector
2955/// operand by the specified number of bits. High-order bits are cleared.
2956///
2957/// \headerfile <x86intrin.h>
2958///
2959/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2960///
2961/// \param __a
2962/// A 128-bit integer vector containing the source operand.
2963/// \param __count
2964/// An integer value specifying the number of bits to right-shift each value
2965/// in operand \a __a.
2966/// \returns A 128-bit integer vector containing the right-shifted values.
2967static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2968 int __count) {
2969 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2970}
2971
2972/// Right-shifts each of 16-bit values in the 128-bit integer vector
2973/// operand by the specified number of bits. High-order bits are cleared.
2974///
2975/// \headerfile <x86intrin.h>
2976///
2977/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2978///
2979/// \param __a
2980/// A 128-bit integer vector containing the source operand.
2981/// \param __count
2982/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2983/// to right-shift each value in operand \a __a.
2984/// \returns A 128-bit integer vector containing the right-shifted values.
2985static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2986 __m128i __count) {
2987 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2988}
2989
2990/// Right-shifts each of 32-bit values in the 128-bit integer vector
2991/// operand by the specified number of bits. High-order bits are cleared.
2992///
2993/// \headerfile <x86intrin.h>
2994///
2995/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2996///
2997/// \param __a
2998/// A 128-bit integer vector containing the source operand.
2999/// \param __count
3000/// An integer value specifying the number of bits to right-shift each value
3001/// in operand \a __a.
3002/// \returns A 128-bit integer vector containing the right-shifted values.
3003static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3004 int __count) {
3005 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3006}
3007
3008/// Right-shifts each of 32-bit values in the 128-bit integer vector
3009/// operand by the specified number of bits. High-order bits are cleared.
3010///
3011/// \headerfile <x86intrin.h>
3012///
3013/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3014///
3015/// \param __a
3016/// A 128-bit integer vector containing the source operand.
3017/// \param __count
3018/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3019/// to right-shift each value in operand \a __a.
3020/// \returns A 128-bit integer vector containing the right-shifted values.
3021static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3022 __m128i __count) {
3023 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3024}
3025
3026/// Right-shifts each of 64-bit values in the 128-bit integer vector
3027/// operand by the specified number of bits. High-order bits are cleared.
3028///
3029/// \headerfile <x86intrin.h>
3030///
3031/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3032///
3033/// \param __a
3034/// A 128-bit integer vector containing the source operand.
3035/// \param __count
3036/// An integer value specifying the number of bits to right-shift each value
3037/// in operand \a __a.
3038/// \returns A 128-bit integer vector containing the right-shifted values.
3039static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3040 int __count) {
3041 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3042}
3043
/// Right-shifts each of the 64-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
3046///
3047/// \headerfile <x86intrin.h>
3048///
3049/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3050///
3051/// \param __a
3052/// A 128-bit integer vector containing the source operand.
3053/// \param __count
3054/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3055/// to right-shift each value in operand \a __a.
3056/// \returns A 128-bit integer vector containing the right-shifted values.
3057static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3058 __m128i __count) {
3059 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3060}
3061
3062/// Compares each of the corresponding 8-bit values of the 128-bit
3063/// integer vectors for equality.
3064///
3065/// Each comparison returns 0x0 for false, 0xFF for true.
3066///
3067/// \headerfile <x86intrin.h>
3068///
3069/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3070///
3071/// \param __a
3072/// A 128-bit integer vector.
3073/// \param __b
3074/// A 128-bit integer vector.
3075/// \returns A 128-bit integer vector containing the comparison results.
3076static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3077 __m128i __b) {
3078 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3079}
3080
3081/// Compares each of the corresponding 16-bit values of the 128-bit
3082/// integer vectors for equality.
3083///
3084/// Each comparison returns 0x0 for false, 0xFFFF for true.
3085///
3086/// \headerfile <x86intrin.h>
3087///
3088/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3089///
3090/// \param __a
3091/// A 128-bit integer vector.
3092/// \param __b
3093/// A 128-bit integer vector.
3094/// \returns A 128-bit integer vector containing the comparison results.
3095static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3096 __m128i __b) {
3097 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3098}
3099
3100/// Compares each of the corresponding 32-bit values of the 128-bit
3101/// integer vectors for equality.
3102///
3103/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3104///
3105/// \headerfile <x86intrin.h>
3106///
3107/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3108///
3109/// \param __a
3110/// A 128-bit integer vector.
3111/// \param __b
3112/// A 128-bit integer vector.
3113/// \returns A 128-bit integer vector containing the comparison results.
3114static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3115 __m128i __b) {
3116 return (__m128i)((__v4si)__a == (__v4si)__b);
3117}
3118
3119/// Compares each of the corresponding signed 8-bit values of the 128-bit
3120/// integer vectors to determine if the values in the first operand are
3121/// greater than those in the second operand.
3122///
3123/// Each comparison returns 0x0 for false, 0xFF for true.
3124///
3125/// \headerfile <x86intrin.h>
3126///
3127/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3128///
3129/// \param __a
3130/// A 128-bit integer vector.
3131/// \param __b
3132/// A 128-bit integer vector.
3133/// \returns A 128-bit integer vector containing the comparison results.
3134static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3135 __m128i __b) {
3136 /* This function always performs a signed comparison, but __v16qi is a char
3137 which may be signed or unsigned, so use __v16qs. */
3138 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3139}
3140
3141/// Compares each of the corresponding signed 16-bit values of the
3142/// 128-bit integer vectors to determine if the values in the first operand
3143/// are greater than those in the second operand.
3144///
3145/// Each comparison returns 0x0 for false, 0xFFFF for true.
3146///
3147/// \headerfile <x86intrin.h>
3148///
3149/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3150///
3151/// \param __a
3152/// A 128-bit integer vector.
3153/// \param __b
3154/// A 128-bit integer vector.
3155/// \returns A 128-bit integer vector containing the comparison results.
3156static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3157 __m128i __b) {
3158 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3159}
3160
3161/// Compares each of the corresponding signed 32-bit values of the
3162/// 128-bit integer vectors to determine if the values in the first operand
3163/// are greater than those in the second operand.
3164///
3165/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3166///
3167/// \headerfile <x86intrin.h>
3168///
3169/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3170///
3171/// \param __a
3172/// A 128-bit integer vector.
3173/// \param __b
3174/// A 128-bit integer vector.
3175/// \returns A 128-bit integer vector containing the comparison results.
3176static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3177 __m128i __b) {
3178 return (__m128i)((__v4si)__a > (__v4si)__b);
3179}
3180
3181/// Compares each of the corresponding signed 8-bit values of the 128-bit
3182/// integer vectors to determine if the values in the first operand are less
3183/// than those in the second operand.
3184///
3185/// Each comparison returns 0x0 for false, 0xFF for true.
3186///
3187/// \headerfile <x86intrin.h>
3188///
3189/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3190///
3191/// \param __a
3192/// A 128-bit integer vector.
3193/// \param __b
3194/// A 128-bit integer vector.
3195/// \returns A 128-bit integer vector containing the comparison results.
3196static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3197 __m128i __b) {
3198 return _mm_cmpgt_epi8(__b, __a);
3199}
3200
3201/// Compares each of the corresponding signed 16-bit values of the
3202/// 128-bit integer vectors to determine if the values in the first operand
3203/// are less than those in the second operand.
3204///
3205/// Each comparison returns 0x0 for false, 0xFFFF for true.
3206///
3207/// \headerfile <x86intrin.h>
3208///
3209/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3210///
3211/// \param __a
3212/// A 128-bit integer vector.
3213/// \param __b
3214/// A 128-bit integer vector.
3215/// \returns A 128-bit integer vector containing the comparison results.
3216static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3217 __m128i __b) {
3218 return _mm_cmpgt_epi16(__b, __a);
3219}
3220
3221/// Compares each of the corresponding signed 32-bit values of the
3222/// 128-bit integer vectors to determine if the values in the first operand
3223/// are less than those in the second operand.
3224///
3225/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3226///
3227/// \headerfile <x86intrin.h>
3228///
3229/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3230///
3231/// \param __a
3232/// A 128-bit integer vector.
3233/// \param __b
3234/// A 128-bit integer vector.
3235/// \returns A 128-bit integer vector containing the comparison results.
3236static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3237 __m128i __b) {
3238 return _mm_cmpgt_epi32(__b, __a);
3239}
3240
3241#ifdef __x86_64__
3242/// Converts a 64-bit signed integer value from the second operand into a
3243/// double-precision value and returns it in the lower element of a [2 x
3244/// double] vector; the upper element of the returned vector is copied from
3245/// the upper element of the first operand.
3246///
3247/// \headerfile <x86intrin.h>
3248///
3249/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3250///
3251/// \param __a
3252/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3253/// copied to the upper 64 bits of the destination.
3254/// \param __b
3255/// A 64-bit signed integer operand containing the value to be converted.
3256/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3257/// converted value of the second operand. The upper 64 bits are copied from
3258/// the upper 64 bits of the first operand.
3259static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3260 long long __b) {
3261 __a[0] = __b;
3262 return __a;
3263}
3264
3265/// Converts the first (lower) element of a vector of [2 x double] into a
3266/// 64-bit signed integer value.
3267///
3268/// If the converted value does not fit in a 64-bit integer, raises a
3269/// floating-point invalid exception. If the exception is masked, returns
3270/// the most negative integer.
3271///
3272/// \headerfile <x86intrin.h>
3273///
3274/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3275///
3276/// \param __a
3277/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3278/// conversion.
3279/// \returns A 64-bit signed integer containing the converted value.
3280static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3281 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3282}
3283
3284/// Converts the first (lower) element of a vector of [2 x double] into a
3285/// 64-bit signed truncated (rounded toward zero) integer value.
3286///
3287/// If a converted value does not fit in a 64-bit integer, raises a
3288/// floating-point invalid exception. If the exception is masked, returns
3289/// the most negative integer.
3290///
3291/// \headerfile <x86intrin.h>
3292///
3293/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3294/// instruction.
3295///
3296/// \param __a
3297/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3298/// conversion.
3299/// \returns A 64-bit signed integer containing the converted value.
3300static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3301 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3302}
3303#endif
3304
3305/// Converts a vector of [4 x i32] into a vector of [4 x float].
3306///
3307/// \headerfile <x86intrin.h>
3308///
3309/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3310///
3311/// \param __a
3312/// A 128-bit integer vector.
3313/// \returns A 128-bit vector of [4 x float] containing the converted values.
3314static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3315 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3316}
3317
3318/// Converts a vector of [4 x float] into a vector of [4 x i32].
3319///
3320/// If a converted value does not fit in a 32-bit integer, raises a
3321/// floating-point invalid exception. If the exception is masked, returns
3322/// the most negative integer.
3323///
3324/// \headerfile <x86intrin.h>
3325///
3326/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3327///
3328/// \param __a
3329/// A 128-bit vector of [4 x float].
3330/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3331/// values.
3332static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3333 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3334}
3335
3336/// Converts a vector of [4 x float] into four signed truncated (rounded toward
3337/// zero) 32-bit integers, returned in a vector of [4 x i32].
3338///
3339/// If a converted value does not fit in a 32-bit integer, raises a
3340/// floating-point invalid exception. If the exception is masked, returns
3341/// the most negative integer.
3342///
3343/// \headerfile <x86intrin.h>
3344///
3345/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3346/// instruction.
3347///
3348/// \param __a
3349/// A 128-bit vector of [4 x float].
3350/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3351static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3352 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3353}
3354
3355/// Returns a vector of [4 x i32] where the lowest element is the input
3356/// operand and the remaining elements are zero.
3357///
3358/// \headerfile <x86intrin.h>
3359///
3360/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3361///
3362/// \param __a
3363/// A 32-bit signed integer operand.
3364/// \returns A 128-bit vector of [4 x i32].
3365static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3366 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3367}
3368
3369/// Returns a vector of [2 x i64] where the lower element is the input
3370/// operand and the upper element is zero.
3371///
3372/// \headerfile <x86intrin.h>
3373///
3374/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3375/// in 64-bit mode.
3376///
3377/// \param __a
3378/// A 64-bit signed integer operand containing the value to be converted.
3379/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3380static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3381 return __extension__(__m128i)(__v2di){__a, 0};
3382}
3383
3384/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3385/// 32-bit signed integer value.
3386///
3387/// \headerfile <x86intrin.h>
3388///
3389/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3390///
3391/// \param __a
3392/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3393/// destination.
3394/// \returns A 32-bit signed integer containing the moved value.
3395static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3396 __v4si __b = (__v4si)__a;
3397 return __b[0];
3398}
3399
3400/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3401/// 64-bit signed integer value.
3402///
3403/// \headerfile <x86intrin.h>
3404///
3405/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3406///
3407/// \param __a
3408/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3409/// destination.
3410/// \returns A 64-bit signed integer containing the moved value.
3411static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3412 return __a[0];
3413}
3414
3415/// Moves packed integer values from an aligned 128-bit memory location
3416/// to elements in a 128-bit integer vector.
3417///
3418/// \headerfile <x86intrin.h>
3419///
3420/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3421///
3422/// \param __p
3423/// An aligned pointer to a memory location containing integer values.
3424/// \returns A 128-bit integer vector containing the moved values.
3425static __inline__ __m128i __DEFAULT_FN_ATTRS
3426_mm_load_si128(__m128i const *__p) {
3427 return *__p;
3428}
3429
3430/// Moves packed integer values from an unaligned 128-bit memory location
3431/// to elements in a 128-bit integer vector.
3432///
3433/// \headerfile <x86intrin.h>
3434///
3435/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3436///
3437/// \param __p
3438/// A pointer to a memory location containing integer values.
3439/// \returns A 128-bit integer vector containing the moved values.
3440static __inline__ __m128i __DEFAULT_FN_ATTRS
3441_mm_loadu_si128(__m128i_u const *__p) {
3442 struct __loadu_si128 {
3443 __m128i_u __v;
3444 } __attribute__((__packed__, __may_alias__));
3445 return ((const struct __loadu_si128 *)__p)->__v;
3446}
3447
3448/// Returns a vector of [2 x i64] where the lower element is taken from
3449/// the lower element of the operand, and the upper element is zero.
3450///
3451/// \headerfile <x86intrin.h>
3452///
3453/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3454///
/// \param __p
/// A pointer to a memory location containing a 64-bit integer value; no
/// particular alignment is required. The value is written to bits [63:0]
/// of the destination.
3458/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3459/// moved value. The higher order bits are cleared.
3460static __inline__ __m128i __DEFAULT_FN_ATTRS
3461_mm_loadl_epi64(__m128i_u const *__p) {
3462 struct __mm_loadl_epi64_struct {
3463 long long __u;
3464 } __attribute__((__packed__, __may_alias__));
3465 return __extension__(__m128i){
3466 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3467}
3468
3469/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3470/// This could be used as an argument to another intrinsic function where the
3471/// argument is required but the value is not actually used.
3472///
3473/// \headerfile <x86intrin.h>
3474///
3475/// This intrinsic has no corresponding instruction.
3476///
3477/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3478static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3479 return (__m128i)__builtin_ia32_undef128();
3480}
3481
3482/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3483/// the specified 64-bit integer values.
3484///
3485/// \headerfile <x86intrin.h>
3486///
3487/// This intrinsic is a utility function and does not correspond to a specific
3488/// instruction.
3489///
3490/// \param __q1
3491/// A 64-bit integer value used to initialize the upper 64 bits of the
3492/// destination vector of [2 x i64].
3493/// \param __q0
3494/// A 64-bit integer value used to initialize the lower 64 bits of the
3495/// destination vector of [2 x i64].
3496/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3497/// provided in the operands.
3498static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3499 long long __q0) {
3500 return __extension__(__m128i)(__v2di){__q0, __q1};
3501}
3502
3503/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3504/// the specified 64-bit integer values.
3505///
3506/// \headerfile <x86intrin.h>
3507///
3508/// This intrinsic is a utility function and does not correspond to a specific
3509/// instruction.
3510///
3511/// \param __q1
3512/// A 64-bit integer value used to initialize the upper 64 bits of the
3513/// destination vector of [2 x i64].
3514/// \param __q0
3515/// A 64-bit integer value used to initialize the lower 64 bits of the
3516/// destination vector of [2 x i64].
3517/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3518/// provided in the operands.
3519static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3520 __m64 __q0) {
3521 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3522}
3523
3524/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3525/// the specified 32-bit integer values.
3526///
3527/// \headerfile <x86intrin.h>
3528///
3529/// This intrinsic is a utility function and does not correspond to a specific
3530/// instruction.
3531///
3532/// \param __i3
3533/// A 32-bit integer value used to initialize bits [127:96] of the
3534/// destination vector.
3535/// \param __i2
3536/// A 32-bit integer value used to initialize bits [95:64] of the destination
3537/// vector.
3538/// \param __i1
3539/// A 32-bit integer value used to initialize bits [63:32] of the destination
3540/// vector.
3541/// \param __i0
3542/// A 32-bit integer value used to initialize bits [31:0] of the destination
3543/// vector.
3544/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3545/// provided in the operands.
3546static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3547 int __i1, int __i0) {
3548 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3549}
3550
3551/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3552/// the specified 16-bit integer values.
3553///
3554/// \headerfile <x86intrin.h>
3555///
3556/// This intrinsic is a utility function and does not correspond to a specific
3557/// instruction.
3558///
3559/// \param __w7
3560/// A 16-bit integer value used to initialize bits [127:112] of the
3561/// destination vector.
3562/// \param __w6
3563/// A 16-bit integer value used to initialize bits [111:96] of the
3564/// destination vector.
3565/// \param __w5
3566/// A 16-bit integer value used to initialize bits [95:80] of the destination
3567/// vector.
3568/// \param __w4
3569/// A 16-bit integer value used to initialize bits [79:64] of the destination
3570/// vector.
3571/// \param __w3
3572/// A 16-bit integer value used to initialize bits [63:48] of the destination
3573/// vector.
3574/// \param __w2
3575/// A 16-bit integer value used to initialize bits [47:32] of the destination
3576/// vector.
3577/// \param __w1
3578/// A 16-bit integer value used to initialize bits [31:16] of the destination
3579/// vector.
3580/// \param __w0
3581/// A 16-bit integer value used to initialize bits [15:0] of the destination
3582/// vector.
3583/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3584/// provided in the operands.
3585static __inline__ __m128i __DEFAULT_FN_ATTRS
3586_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3587 short __w2, short __w1, short __w0) {
3588 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3589 __w4, __w5, __w6, __w7};
3590}
3591
3592/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3593/// the specified 8-bit integer values.
3594///
3595/// \headerfile <x86intrin.h>
3596///
3597/// This intrinsic is a utility function and does not correspond to a specific
3598/// instruction.
3599///
3600/// \param __b15
3601/// Initializes bits [127:120] of the destination vector.
3602/// \param __b14
3603/// Initializes bits [119:112] of the destination vector.
3604/// \param __b13
3605/// Initializes bits [111:104] of the destination vector.
3606/// \param __b12
3607/// Initializes bits [103:96] of the destination vector.
3608/// \param __b11
3609/// Initializes bits [95:88] of the destination vector.
3610/// \param __b10
3611/// Initializes bits [87:80] of the destination vector.
3612/// \param __b9
3613/// Initializes bits [79:72] of the destination vector.
3614/// \param __b8
3615/// Initializes bits [71:64] of the destination vector.
3616/// \param __b7
3617/// Initializes bits [63:56] of the destination vector.
3618/// \param __b6
3619/// Initializes bits [55:48] of the destination vector.
3620/// \param __b5
3621/// Initializes bits [47:40] of the destination vector.
3622/// \param __b4
3623/// Initializes bits [39:32] of the destination vector.
3624/// \param __b3
3625/// Initializes bits [31:24] of the destination vector.
3626/// \param __b2
3627/// Initializes bits [23:16] of the destination vector.
3628/// \param __b1
3629/// Initializes bits [15:8] of the destination vector.
3630/// \param __b0
3631/// Initializes bits [7:0] of the destination vector.
3632/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3633/// provided in the operands.
3634static __inline__ __m128i __DEFAULT_FN_ATTRS
3635_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3636 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3637 char __b4, char __b3, char __b2, char __b1, char __b0) {
3638 return __extension__(__m128i)(__v16qi){
3639 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3640 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3641}
3642
3643/// Initializes both values in a 128-bit integer vector with the
3644/// specified 64-bit integer value.
3645///
3646/// \headerfile <x86intrin.h>
3647///
3648/// This intrinsic is a utility function and does not correspond to a specific
3649/// instruction.
3650///
3651/// \param __q
3652/// Integer value used to initialize the elements of the destination integer
3653/// vector.
3654/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3655/// elements containing the value provided in the operand.
3656static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3657 return _mm_set_epi64x(__q, __q);
3658}
3659
3660/// Initializes both values in a 128-bit vector of [2 x i64] with the
3661/// specified 64-bit value.
3662///
3663/// \headerfile <x86intrin.h>
3664///
3665/// This intrinsic is a utility function and does not correspond to a specific
3666/// instruction.
3667///
3668/// \param __q
3669/// A 64-bit value used to initialize the elements of the destination integer
3670/// vector.
3671/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3672/// containing the value provided in the operand.
3673static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3674 return _mm_set_epi64(__q, __q);
3675}
3676
3677/// Initializes all values in a 128-bit vector of [4 x i32] with the
3678/// specified 32-bit value.
3679///
3680/// \headerfile <x86intrin.h>
3681///
3682/// This intrinsic is a utility function and does not correspond to a specific
3683/// instruction.
3684///
3685/// \param __i
3686/// A 32-bit value used to initialize the elements of the destination integer
3687/// vector.
3688/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3689/// containing the value provided in the operand.
3690static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3691 return _mm_set_epi32(__i, __i, __i, __i);
3692}
3693
3694/// Initializes all values in a 128-bit vector of [8 x i16] with the
3695/// specified 16-bit value.
3696///
3697/// \headerfile <x86intrin.h>
3698///
3699/// This intrinsic is a utility function and does not correspond to a specific
3700/// instruction.
3701///
3702/// \param __w
3703/// A 16-bit value used to initialize the elements of the destination integer
3704/// vector.
3705/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3706/// containing the value provided in the operand.
3707static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3708 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3709}
3710
3711/// Initializes all values in a 128-bit vector of [16 x i8] with the
3712/// specified 8-bit value.
3713///
3714/// \headerfile <x86intrin.h>
3715///
3716/// This intrinsic is a utility function and does not correspond to a specific
3717/// instruction.
3718///
3719/// \param __b
3720/// An 8-bit value used to initialize the elements of the destination integer
3721/// vector.
3722/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3723/// containing the value provided in the operand.
3724static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3725 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3726 __b, __b, __b, __b, __b);
3727}
3728
3729/// Constructs a 128-bit integer vector, initialized in reverse order
3730/// with the specified 64-bit integral values.
3731///
3732/// \headerfile <x86intrin.h>
3733///
3734/// This intrinsic does not correspond to a specific instruction.
3735///
3736/// \param __q0
3737/// A 64-bit integral value used to initialize the lower 64 bits of the
3738/// result.
3739/// \param __q1
3740/// A 64-bit integral value used to initialize the upper 64 bits of the
3741/// result.
3742/// \returns An initialized 128-bit integer vector.
3743static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3744 __m64 __q1) {
3745 return _mm_set_epi64(__q1, __q0);
3746}
3747
3748/// Constructs a 128-bit integer vector, initialized in reverse order
3749/// with the specified 32-bit integral values.
3750///
3751/// \headerfile <x86intrin.h>
3752///
3753/// This intrinsic is a utility function and does not correspond to a specific
3754/// instruction.
3755///
3756/// \param __i0
3757/// A 32-bit integral value used to initialize bits [31:0] of the result.
3758/// \param __i1
3759/// A 32-bit integral value used to initialize bits [63:32] of the result.
3760/// \param __i2
3761/// A 32-bit integral value used to initialize bits [95:64] of the result.
3762/// \param __i3
3763/// A 32-bit integral value used to initialize bits [127:96] of the result.
3764/// \returns An initialized 128-bit integer vector.
3765static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3766 int __i2,
3767 int __i3) {
3768 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3769}
3770
3771/// Constructs a 128-bit integer vector, initialized in reverse order
3772/// with the specified 16-bit integral values.
3773///
3774/// \headerfile <x86intrin.h>
3775///
3776/// This intrinsic is a utility function and does not correspond to a specific
3777/// instruction.
3778///
3779/// \param __w0
3780/// A 16-bit integral value used to initialize bits [15:0] of the result.
3781/// \param __w1
3782/// A 16-bit integral value used to initialize bits [31:16] of the result.
3783/// \param __w2
3784/// A 16-bit integral value used to initialize bits [47:32] of the result.
3785/// \param __w3
3786/// A 16-bit integral value used to initialize bits [63:48] of the result.
3787/// \param __w4
3788/// A 16-bit integral value used to initialize bits [79:64] of the result.
3789/// \param __w5
3790/// A 16-bit integral value used to initialize bits [95:80] of the result.
3791/// \param __w6
3792/// A 16-bit integral value used to initialize bits [111:96] of the result.
3793/// \param __w7
3794/// A 16-bit integral value used to initialize bits [127:112] of the result.
3795/// \returns An initialized 128-bit integer vector.
3796static __inline__ __m128i __DEFAULT_FN_ATTRS
3797_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3798 short __w5, short __w6, short __w7) {
3799 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3800}
3801
3802/// Constructs a 128-bit integer vector, initialized in reverse order
3803/// with the specified 8-bit integral values.
3804///
3805/// \headerfile <x86intrin.h>
3806///
3807/// This intrinsic is a utility function and does not correspond to a specific
3808/// instruction.
3809///
3810/// \param __b0
3811/// An 8-bit integral value used to initialize bits [7:0] of the result.
3812/// \param __b1
3813/// An 8-bit integral value used to initialize bits [15:8] of the result.
3814/// \param __b2
3815/// An 8-bit integral value used to initialize bits [23:16] of the result.
3816/// \param __b3
3817/// An 8-bit integral value used to initialize bits [31:24] of the result.
3818/// \param __b4
3819/// An 8-bit integral value used to initialize bits [39:32] of the result.
3820/// \param __b5
3821/// An 8-bit integral value used to initialize bits [47:40] of the result.
3822/// \param __b6
3823/// An 8-bit integral value used to initialize bits [55:48] of the result.
3824/// \param __b7
3825/// An 8-bit integral value used to initialize bits [63:56] of the result.
3826/// \param __b8
3827/// An 8-bit integral value used to initialize bits [71:64] of the result.
3828/// \param __b9
3829/// An 8-bit integral value used to initialize bits [79:72] of the result.
3830/// \param __b10
3831/// An 8-bit integral value used to initialize bits [87:80] of the result.
3832/// \param __b11
3833/// An 8-bit integral value used to initialize bits [95:88] of the result.
3834/// \param __b12
3835/// An 8-bit integral value used to initialize bits [103:96] of the result.
3836/// \param __b13
3837/// An 8-bit integral value used to initialize bits [111:104] of the result.
3838/// \param __b14
3839/// An 8-bit integral value used to initialize bits [119:112] of the result.
3840/// \param __b15
3841/// An 8-bit integral value used to initialize bits [127:120] of the result.
3842/// \returns An initialized 128-bit integer vector.
3843static __inline__ __m128i __DEFAULT_FN_ATTRS
3844_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3845 char __b6, char __b7, char __b8, char __b9, char __b10,
3846 char __b11, char __b12, char __b13, char __b14, char __b15) {
3847 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3848 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3849}
3850
3851/// Creates a 128-bit integer vector initialized to zero.
3852///
3853/// \headerfile <x86intrin.h>
3854///
3855/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3856///
3857/// \returns An initialized 128-bit integer vector with all elements set to
3858/// zero.
3859static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3860 return __extension__(__m128i)(__v2di){0LL, 0LL};
3861}
3862
3863/// Stores a 128-bit integer vector to a memory location aligned on a
3864/// 128-bit boundary.
3865///
3866/// \headerfile <x86intrin.h>
3867///
3868/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3869///
3870/// \param __p
3871/// A pointer to an aligned memory location that will receive the integer
3872/// values.
3873/// \param __b
3874/// A 128-bit integer vector containing the values to be moved.
3875static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3876 __m128i __b) {
3877 *__p = __b;
3878}
3879
3880/// Stores a 128-bit integer vector to an unaligned memory location.
3881///
3882/// \headerfile <x86intrin.h>
3883///
3884/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3885///
3886/// \param __p
3887/// A pointer to a memory location that will receive the integer values.
3888/// \param __b
3889/// A 128-bit integer vector containing the values to be moved.
3890static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3891 __m128i __b) {
3892 struct __storeu_si128 {
3893 __m128i_u __v;
3894 } __attribute__((__packed__, __may_alias__));
3895 ((struct __storeu_si128 *)__p)->__v = __b;
3896}
3897
3898/// Stores a 64-bit integer value from the low element of a 128-bit integer
3899/// vector.
3900///
3901/// \headerfile <x86intrin.h>
3902///
3903/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3904///
3905/// \param __p
3906/// A pointer to a 64-bit memory location. The address of the memory
3907/// location does not have to be aligned.
3908/// \param __b
3909/// A 128-bit integer vector containing the value to be stored.
3910static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3911 __m128i __b) {
3912 struct __storeu_si64 {
3913 long long __v;
3914 } __attribute__((__packed__, __may_alias__));
3915 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3916}
3917
3918/// Stores a 32-bit integer value from the low element of a 128-bit integer
3919/// vector.
3920///
3921/// \headerfile <x86intrin.h>
3922///
3923/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3924///
3925/// \param __p
3926/// A pointer to a 32-bit memory location. The address of the memory
3927/// location does not have to be aligned.
3928/// \param __b
3929/// A 128-bit integer vector containing the value to be stored.
3930static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3931 __m128i __b) {
3932 struct __storeu_si32 {
3933 int __v;
3934 } __attribute__((__packed__, __may_alias__));
3935 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3936}
3937
3938/// Stores a 16-bit integer value from the low element of a 128-bit integer
3939/// vector.
3940///
3941/// \headerfile <x86intrin.h>
3942///
3943/// This intrinsic does not correspond to a specific instruction.
3944///
3945/// \param __p
3946/// A pointer to a 16-bit memory location. The address of the memory
3947/// location does not have to be aligned.
3948/// \param __b
3949/// A 128-bit integer vector containing the value to be stored.
3950static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3951 __m128i __b) {
3952 struct __storeu_si16 {
3953 short __v;
3954 } __attribute__((__packed__, __may_alias__));
3955 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3956}
3957
3958/// Moves bytes selected by the mask from the first operand to the
3959/// specified unaligned memory location. When a mask bit is 1, the
3960/// corresponding byte is written, otherwise it is not written.
3961///
3962/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3963/// used again soon). Exception and trap behavior for elements not selected
3964/// for storage to memory are implementation dependent.
3965///
3966/// \headerfile <x86intrin.h>
3967///
3968/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3969/// instruction.
3970///
3971/// \param __d
3972/// A 128-bit integer vector containing the values to be moved.
3973/// \param __n
3974/// A 128-bit integer vector containing the mask. The most significant bit of
3975/// each byte represents the mask bits.
3976/// \param __p
3977/// A pointer to an unaligned 128-bit memory location where the specified
3978/// values are moved.
3979static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3980 __m128i __n,
3981 char *__p) {
3982 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3983}
3984
3985/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3986/// a memory location.
3987///
3988/// \headerfile <x86intrin.h>
3989///
3990/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3991///
3992/// \param __p
3993/// A pointer to a 64-bit memory location that will receive the lower 64 bits
3994/// of the integer vector parameter.
3995/// \param __a
3996/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3997/// value to be stored.
3998static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3999 __m128i __a) {
4000 struct __mm_storel_epi64_struct {
4001 long long __u;
4002 } __attribute__((__packed__, __may_alias__));
4003 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4004}
4005
4006/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4007/// aligned memory location.
4008///
4009/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4010/// used again soon).
4011///
4012/// \headerfile <x86intrin.h>
4013///
4014/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4015///
4016/// \param __p
4017/// A pointer to the 128-bit aligned memory location used to store the value.
4018/// \param __a
4019/// A vector of [2 x double] containing the 64-bit values to be stored.
4020static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4021 __m128d __a) {
4022 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4023}
4024
4025/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4026///
4027/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4028/// used again soon).
4029///
4030/// \headerfile <x86intrin.h>
4031///
4032/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4033///
4034/// \param __p
4035/// A pointer to the 128-bit aligned memory location used to store the value.
4036/// \param __a
4037/// A 128-bit integer vector containing the values to be stored.
4038static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4039 __m128i __a) {
4040 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4041}
4042
4043/// Stores a 32-bit integer value in the specified memory location.
4044///
4045/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4046/// used again soon).
4047///
4048/// \headerfile <x86intrin.h>
4049///
4050/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4051///
4052/// \param __p
4053/// A pointer to the 32-bit memory location used to store the value.
4054/// \param __a
4055/// A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  /* The builtin wants a typed int pointer; the caller hands us void *. */
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
4061
4062#ifdef __x86_64__
4063/// Stores a 64-bit integer value in the specified memory location.
4064///
4065/// To minimize caching, the data is flagged as non-temporal (unlikely to be
4066/// used again soon).
4067///
4068/// \headerfile <x86intrin.h>
4069///
4070/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4071///
4072/// \param __p
4073/// A pointer to the 64-bit memory location used to store the value.
4074/// \param __a
4075/// A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  /* The builtin wants a typed long long pointer; the caller hands us
   * void *. */
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
4081#endif
4082
#if defined(__cplusplus)
/* Give the declarations in this group C linkage when the header is compiled
 * as C++. */
extern "C" {
#endif

/* The three functions in this group are declared here without a body, unlike
 * the static inline intrinsics elsewhere in this header. */

/// The cache line containing \a __p is flushed and invalidated from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between load and store
///    instructions preceding this instruction and load and store instructions
///    following this instruction, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4124
4125/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4126/// vector operands into 8-bit signed integers, and packs the results into
4127/// the destination.
4128///
///    Values greater than 0x7F (127) are saturated to 0x7F. Values less than
///    -128 are saturated to 0x80 (-128).
4131///
4132/// \headerfile <x86intrin.h>
4133///
4134/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4135///
4136/// \param __a
4137/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4138/// written to the lower 64 bits of the result.
4139/// \param __b
4140/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4141/// written to the higher 64 bits of the result.
4142/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4143static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4144 __m128i __b) {
4145 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4146}
4147
4148/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4149/// vector operands into 16-bit signed integers, and packs the results into
4150/// the destination.
4151///
///    Values greater than 0x7FFF (32767) are saturated to 0x7FFF. Values less
///    than -32768 are saturated to 0x8000 (-32768).
4154///
4155/// \headerfile <x86intrin.h>
4156///
4157/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4158///
4159/// \param __a
4160/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4161/// are written to the lower 64 bits of the result.
4162/// \param __b
4163/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4164/// are written to the higher 64 bits of the result.
4165/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4166static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4167 __m128i __b) {
4168 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4169}
4170
4171/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4172/// vector operands into 8-bit unsigned integers, and packs the results into
4173/// the destination.
4174///
4175/// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4176/// are saturated to 0x00.
4177///
4178/// \headerfile <x86intrin.h>
4179///
4180/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4181///
4182/// \param __a
4183/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4184/// written to the lower 64 bits of the result.
4185/// \param __b
4186/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4187/// written to the higher 64 bits of the result.
4188/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4189static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4190 __m128i __b) {
4191 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4192}
4193
4194/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4195/// the immediate-value parameter as a selector.
4196///
4197/// \headerfile <x86intrin.h>
4198///
4199/// \code
4200/// __m128i _mm_extract_epi16(__m128i a, const int imm);
4201/// \endcode
4202///
4203/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4204///
4205/// \param a
4206/// A 128-bit integer vector.
4207/// \param imm
///    An immediate value. Bits [2:0] select the value from \a a to be assigned
///    to bits [15:0] of the result. \n
4210/// 000: assign values from bits [15:0] of \a a. \n
4211/// 001: assign values from bits [31:16] of \a a. \n
4212/// 010: assign values from bits [47:32] of \a a. \n
4213/// 011: assign values from bits [63:48] of \a a. \n
4214/// 100: assign values from bits [79:64] of \a a. \n
4215/// 101: assign values from bits [95:80] of \a a. \n
4216/// 110: assign values from bits [111:96] of \a a. \n
4217/// 111: assign values from bits [127:112] of \a a.
4218/// \returns An integer, whose lower 16 bits are selected from the 128-bit
4219/// integer vector parameter and the remaining bits are assigned zeros.
/* Function-like macro: `a` is converted to __m128i and viewed as [8 x i16],
 * `imm` is converted to int, and both are passed to the builtin. The
 * (unsigned short) cast makes the widening to int zero-extend, so the upper
 * bits of the result are zero. */
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4223
4224/// Constructs a 128-bit integer vector by first making a copy of the
4225/// 128-bit integer vector parameter, and then inserting the lower 16 bits
4226/// of an integer parameter into an offset specified by the immediate-value
4227/// parameter.
4228///
4229/// \headerfile <x86intrin.h>
4230///
4231/// \code
4232/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4233/// \endcode
4234///
4235/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4236///
4237/// \param a
4238/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4239/// result and then one of the eight elements in the result is replaced by
4240/// the lower 16 bits of \a b.
4241/// \param b
4242/// An integer. The lower 16 bits of this parameter are written to the
4243/// result beginning at an offset specified by \a imm.
4244/// \param imm
///    An immediate value. Bits [2:0] select which of the eight 16-bit
///    elements of the result is replaced by the lower 16 bits of \a b.
4247/// \returns A 128-bit integer vector containing the constructed values.
/* Function-like macro: `a` is converted to __m128i and viewed as [8 x i16];
 * `b` and `imm` are converted to int before being passed to the builtin. The
 * builtin's result is cast back to __m128i. */
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4251
4252/// Copies the values of the most significant bits from each 8-bit
4253/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4254/// value, zero-extends the value, and writes it to the destination.
4255///
4256/// \headerfile <x86intrin.h>
4257///
4258/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4259///
4260/// \param __a
4261/// A 128-bit integer vector containing the values with bits to be extracted.
4262/// \returns The most significant bits from each 8-bit element in \a __a,
4263/// written to bits [15:0]. The other bits are assigned zeros.
4264static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4265 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4266}
4267
4268/// Constructs a 128-bit integer vector by shuffling four 32-bit
4269/// elements of a 128-bit integer vector parameter, using the immediate-value
4270/// parameter as a specifier.
4271///
4272/// \headerfile <x86intrin.h>
4273///
4274/// \code
4275/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4276/// \endcode
4277///
4278/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4279///
4280/// \param a
4281/// A 128-bit integer vector containing the values to be copied.
4282/// \param imm
4283/// An immediate value containing an 8-bit value specifying which elements to
4284/// copy from a. The destinations within the 128-bit destination are assigned
4285/// values as follows: \n
4286/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4287/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4288/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4289/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4290/// Bit value assignments: \n
4291/// 00: assign values from bits [31:0] of \a a. \n
4292/// 01: assign values from bits [63:32] of \a a. \n
4293/// 10: assign values from bits [95:64] of \a a. \n
4294/// 11: assign values from bits [127:96] of \a a. \n
4295/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4296/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4297/// <c>[b6, b4, b2, b0]</c>.
4298/// \returns A 128-bit integer vector containing the shuffled values.
/* Function-like macro: `a` is converted to __m128i and viewed as [4 x i32],
 * `imm` is converted to int, and the builtin's result is cast back to
 * __m128i. */
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4301
4302/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4303/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4304/// value parameter as a specifier.
4305///
4306/// \headerfile <x86intrin.h>
4307///
4308/// \code
4309/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4310/// \endcode
4311///
4312/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4313///
4314/// \param a
4315/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4316/// [127:64] of the result.
4317/// \param imm
4318/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4319/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4320/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4321/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4322/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4323/// Bit value assignments: \n
4324/// 00: assign values from bits [15:0] of \a a. \n
4325/// 01: assign values from bits [31:16] of \a a. \n
4326/// 10: assign values from bits [47:32] of \a a. \n
4327/// 11: assign values from bits [63:48] of \a a. \n
4328/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4329/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4330/// <c>[b6, b4, b2, b0]</c>.
4331/// \returns A 128-bit integer vector containing the shuffled values.
/* Function-like macro: `a` is converted to __m128i and viewed as [8 x i16],
 * `imm` is converted to int, and the builtin's result is cast back to
 * __m128i. */
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4334
4335/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4336/// elements of a 128-bit integer vector of [8 x i16], using the immediate
4337/// value parameter as a specifier.
4338///
4339/// \headerfile <x86intrin.h>
4340///
4341/// \code
4342/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4343/// \endcode
4344///
4345/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4346///
4347/// \param a
4348/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4349/// [63:0] of the result.
4350/// \param imm
4351/// An 8-bit immediate value specifying which elements to copy from \a a. \n
4352/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4353/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4354/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4355/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4356/// Bit value assignments: \n
4357/// 00: assign values from bits [79:64] of \a a. \n
4358/// 01: assign values from bits [95:80] of \a a. \n
4359/// 10: assign values from bits [111:96] of \a a. \n
4360/// 11: assign values from bits [127:112] of \a a. \n
4361/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4362/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4363/// <c>[b6, b4, b2, b0]</c>.
4364/// \returns A 128-bit integer vector containing the shuffled values.
/* Function-like macro: `a` is converted to __m128i and viewed as [8 x i16],
 * `imm` is converted to int, and the builtin's result is cast back to
 * __m128i. */
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4367
4368/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4369/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4370///
4371/// \headerfile <x86intrin.h>
4372///
4373/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4374/// instruction.
4375///
4376/// \param __a
4377/// A 128-bit vector of [16 x i8].
4378/// Bits [71:64] are written to bits [7:0] of the result. \n
4379/// Bits [79:72] are written to bits [23:16] of the result. \n
4380/// Bits [87:80] are written to bits [39:32] of the result. \n
4381/// Bits [95:88] are written to bits [55:48] of the result. \n
4382/// Bits [103:96] are written to bits [71:64] of the result. \n
4383/// Bits [111:104] are written to bits [87:80] of the result. \n
4384/// Bits [119:112] are written to bits [103:96] of the result. \n
4385/// Bits [127:120] are written to bits [119:112] of the result.
4386/// \param __b
4387/// A 128-bit vector of [16 x i8]. \n
4388/// Bits [71:64] are written to bits [15:8] of the result. \n
4389/// Bits [79:72] are written to bits [31:24] of the result. \n
4390/// Bits [87:80] are written to bits [47:40] of the result. \n
4391/// Bits [95:88] are written to bits [63:56] of the result. \n
4392/// Bits [103:96] are written to bits [79:72] of the result. \n
4393/// Bits [111:104] are written to bits [95:88] of the result. \n
4394/// Bits [119:112] are written to bits [111:104] of the result. \n
4395/// Bits [127:120] are written to bits [127:120] of the result.
4396/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4397static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4398 __m128i __b) {
4399 return (__m128i)__builtin_shufflevector(
4400 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4401 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4402}
4403
4404/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4405/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4406///
4407/// \headerfile <x86intrin.h>
4408///
4409/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4410/// instruction.
4411///
4412/// \param __a
4413/// A 128-bit vector of [8 x i16].
4414/// Bits [79:64] are written to bits [15:0] of the result. \n
4415/// Bits [95:80] are written to bits [47:32] of the result. \n
4416/// Bits [111:96] are written to bits [79:64] of the result. \n
4417/// Bits [127:112] are written to bits [111:96] of the result.
4418/// \param __b
4419/// A 128-bit vector of [8 x i16].
4420/// Bits [79:64] are written to bits [31:16] of the result. \n
4421/// Bits [95:80] are written to bits [63:48] of the result. \n
4422/// Bits [111:96] are written to bits [95:80] of the result. \n
4423/// Bits [127:112] are written to bits [127:112] of the result.
4424/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4425static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4426 __m128i __b) {
4427 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4428 8 + 5, 6, 8 + 6, 7, 8 + 7);
4429}
4430
4431/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4432/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4433///
4434/// \headerfile <x86intrin.h>
4435///
4436/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4437/// instruction.
4438///
4439/// \param __a
4440/// A 128-bit vector of [4 x i32]. \n
4441/// Bits [95:64] are written to bits [31:0] of the destination. \n
4442/// Bits [127:96] are written to bits [95:64] of the destination.
4443/// \param __b
4444/// A 128-bit vector of [4 x i32]. \n
///    Bits [95:64] are written to bits [63:32] of the destination. \n
4446/// Bits [127:96] are written to bits [127:96] of the destination.
4447/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4448static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4449 __m128i __b) {
4450 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4451 4 + 3);
4452}
4453
4454/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4455/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4456///
4457/// \headerfile <x86intrin.h>
4458///
4459/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4460/// instruction.
4461///
4462/// \param __a
4463/// A 128-bit vector of [2 x i64]. \n
4464/// Bits [127:64] are written to bits [63:0] of the destination.
4465/// \param __b
4466/// A 128-bit vector of [2 x i64]. \n
4467/// Bits [127:64] are written to bits [127:64] of the destination.
4468/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4469static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4470 __m128i __b) {
4471 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4472}
4473
4474/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4475/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4476///
4477/// \headerfile <x86intrin.h>
4478///
4479/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4480/// instruction.
4481///
4482/// \param __a
4483/// A 128-bit vector of [16 x i8]. \n
4484/// Bits [7:0] are written to bits [7:0] of the result. \n
4485/// Bits [15:8] are written to bits [23:16] of the result. \n
4486/// Bits [23:16] are written to bits [39:32] of the result. \n
4487/// Bits [31:24] are written to bits [55:48] of the result. \n
4488/// Bits [39:32] are written to bits [71:64] of the result. \n
4489/// Bits [47:40] are written to bits [87:80] of the result. \n
4490/// Bits [55:48] are written to bits [103:96] of the result. \n
4491/// Bits [63:56] are written to bits [119:112] of the result.
4492/// \param __b
4493/// A 128-bit vector of [16 x i8].
4494/// Bits [7:0] are written to bits [15:8] of the result. \n
4495/// Bits [15:8] are written to bits [31:24] of the result. \n
4496/// Bits [23:16] are written to bits [47:40] of the result. \n
4497/// Bits [31:24] are written to bits [63:56] of the result. \n
4498/// Bits [39:32] are written to bits [79:72] of the result. \n
4499/// Bits [47:40] are written to bits [95:88] of the result. \n
4500/// Bits [55:48] are written to bits [111:104] of the result. \n
4501/// Bits [63:56] are written to bits [127:120] of the result.
4502/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4503static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4504 __m128i __b) {
4505 return (__m128i)__builtin_shufflevector(
4506 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4507 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4508}
4509
4510/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4511/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4512/// [8 x i16].
4513///
4514/// \headerfile <x86intrin.h>
4515///
4516/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4517/// instruction.
4518///
4519/// \param __a
4520/// A 128-bit vector of [8 x i16].
4521/// Bits [15:0] are written to bits [15:0] of the result. \n
4522/// Bits [31:16] are written to bits [47:32] of the result. \n
4523/// Bits [47:32] are written to bits [79:64] of the result. \n
4524/// Bits [63:48] are written to bits [111:96] of the result.
4525/// \param __b
4526/// A 128-bit vector of [8 x i16].
4527/// Bits [15:0] are written to bits [31:16] of the result. \n
4528/// Bits [31:16] are written to bits [63:48] of the result. \n
4529/// Bits [47:32] are written to bits [95:80] of the result. \n
4530/// Bits [63:48] are written to bits [127:112] of the result.
4531/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4532static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4533 __m128i __b) {
4534 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4535 8 + 1, 2, 8 + 2, 3, 8 + 3);
4536}
4537
4538/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4539/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4540///
4541/// \headerfile <x86intrin.h>
4542///
4543/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4544/// instruction.
4545///
4546/// \param __a
4547/// A 128-bit vector of [4 x i32]. \n
4548/// Bits [31:0] are written to bits [31:0] of the destination. \n
4549/// Bits [63:32] are written to bits [95:64] of the destination.
4550/// \param __b
4551/// A 128-bit vector of [4 x i32]. \n
///    Bits [31:0] are written to bits [63:32] of the destination. \n
4553/// Bits [63:32] are written to bits [127:96] of the destination.
4554/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4555static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4556 __m128i __b) {
4557 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4558 4 + 1);
4559}
4560
4561/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4562/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4563///
4564/// \headerfile <x86intrin.h>
4565///
4566/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4567/// instruction.
4568///
4569/// \param __a
4570/// A 128-bit vector of [2 x i64]. \n
4571/// Bits [63:0] are written to bits [63:0] of the destination. \n
4572/// \param __b
4573/// A 128-bit vector of [2 x i64]. \n
4574/// Bits [63:0] are written to bits [127:64] of the destination. \n
4575/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4576static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4577 __m128i __b) {
4578 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4579}
4580
4581/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4582/// value of type \c __m64.
4583///
4584/// \headerfile <x86intrin.h>
4585///
4586/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4587///
4588/// \param __a
4589/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4590/// destination.
4591/// \returns A \c __m64 value containing the lower 64 bits of the parameter.
4592static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4593 return (__m64)__a[0];
4594}
4595
4596/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4597/// upper bits.
4598///
4599/// \headerfile <x86intrin.h>
4600///
4601/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instructions.
4602///
4603/// \param __a
4604/// A 64-bit value of type \c __m64.
4605/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4606/// the operand. The upper 64 bits are assigned zeros.
4607static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4608 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4609}
4610
4611/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4612/// integer vector, zeroing the upper bits.
4613///
4614/// \headerfile <x86intrin.h>
4615///
4616/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4617///
4618/// \param __a
4619/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4620/// destination.
4621/// \returns A 128-bit integer vector. The lower 64 bits are copied from the
4622/// lower 64 bits of the operand. The upper 64 bits are assigned zeros.
4623static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4624 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4625}
4626
4627/// Unpacks the high-order (upper) 64-bit elements from two 128-bit vectors
4628/// of [2 x double] and interleaves them into a 128-bit vector of
4629/// [2 x double].
4630///
4631/// \headerfile <x86intrin.h>
4632///
4633/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4634///
4635/// \param __a
4636/// A 128-bit vector of [2 x double]. \n
4637/// Bits [127:64] are written to bits [63:0] of the destination.
4638/// \param __b
4639/// A 128-bit vector of [2 x double]. \n
4640/// Bits [127:64] are written to bits [127:64] of the destination.
4641/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4642static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4643 __m128d __b) {
4644 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4645}
4646
4647/// Unpacks the low-order (lower) 64-bit elements from two 128-bit vectors
4648/// of [2 x double] and interleaves them into a 128-bit vector of
4649/// [2 x double].
4650///
4651/// \headerfile <x86intrin.h>
4652///
4653/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4654///
4655/// \param __a
4656/// A 128-bit vector of [2 x double]. \n
4657/// Bits [63:0] are written to bits [63:0] of the destination.
4658/// \param __b
4659/// A 128-bit vector of [2 x double]. \n
4660/// Bits [63:0] are written to bits [127:64] of the destination.
4661/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4662static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4663 __m128d __b) {
4664 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4665}
4666
4667/// Extracts the sign bits of the two double-precision values in a 128-bit
4668/// vector of [2 x double], zero-extends the resulting 2-bit value, and
4669/// returns it in the low-order bits of the result.
4670///
4671/// \headerfile <x86intrin.h>
4672///
4673/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4674///
4675/// \param __a
4676/// A 128-bit vector of [2 x double] containing the values with sign bits to
4677/// be extracted.
4678/// \returns The sign bits from each of the double-precision elements in \a __a,
4679/// written to bits [1:0]. The remaining bits are assigned values of zero.
4680static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4681 return __builtin_ia32_movmskpd((__v2df)__a);
4682}
4683
4684/// Constructs a 128-bit floating-point vector of [2 x double] from two
4685/// 128-bit vector parameters of [2 x double], using the immediate-value
4686/// parameter as a specifier.
4687///
4688/// \headerfile <x86intrin.h>
4689///
4690/// \code
4691/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4692/// \endcode
4693///
4694/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4695///
4696/// \param a
4697/// A 128-bit vector of [2 x double].
4698/// \param b
4699/// A 128-bit vector of [2 x double].
4700/// \param i
4701/// An 8-bit immediate value. The least significant two bits specify which
4702/// elements to copy from \a a and \a b: \n
4703/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4704/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4705/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4706/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4707/// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4708/// <c>_MM_SHUFFLE2(b1, b0)</c> creates a 2-bit mask of the form
4709/// <c>[b1, b0]</c>, where \c b1 is Bit[1] and \c b0 is Bit[0].
4710/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
4711#define _mm_shuffle_pd(a, b, i) \
4712 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4713 (int)(i)))
4714
4715/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4716/// floating-point vector of [4 x float].
4717///
4718/// \headerfile <x86intrin.h>
4719///
4720/// This intrinsic has no corresponding instruction; no value conversion occurs.
4721///
4722/// \param __a
4723/// A 128-bit floating-point vector of [2 x double].
4724/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4725/// bitwise pattern as the parameter.
4726static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4727 return (__m128)__a;
4728}
4729
4730/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4731/// integer vector.
4732///
4733/// \headerfile <x86intrin.h>
4734///
4735/// This intrinsic has no corresponding instruction; no value conversion occurs.
4736///
4737/// \param __a
4738/// A 128-bit floating-point vector of [2 x double].
4739/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4740/// parameter.
4741static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4742 return (__m128i)__a;
4743}
4744
4745/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4746/// floating-point vector of [2 x double].
4747///
4748/// \headerfile <x86intrin.h>
4749///
4750/// This intrinsic has no corresponding instruction; no value conversion occurs.
4751///
4752/// \param __a
4753/// A 128-bit floating-point vector of [4 x float].
4754/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4755/// bitwise pattern as the parameter.
4756static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4757 return (__m128d)__a;
4758}
4759
4760/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4761/// integer vector.
4762///
4763/// \headerfile <x86intrin.h>
4764///
4765/// This intrinsic has no corresponding instruction; no value conversion occurs.
4766///
4767/// \param __a
4768/// A 128-bit floating-point vector of [4 x float].
4769/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4770/// parameter.
4771static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4772 return (__m128i)__a;
4773}
4774
4775/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4776/// of [4 x float].
4777///
4778/// \headerfile <x86intrin.h>
4779///
4780/// This intrinsic has no corresponding instruction; no value conversion occurs.
4781///
4782/// \param __a
4783/// A 128-bit integer vector.
4784/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4785/// bitwise pattern as the parameter.
4786static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4787 return (__m128)__a;
4788}
4789
4790/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4791/// of [2 x double].
4792///
4793/// \headerfile <x86intrin.h>
4794///
4795/// This intrinsic has no corresponding instruction; no value conversion occurs.
4796///
4797/// \param __a
4798/// A 128-bit integer vector.
4799/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4800/// bitwise pattern as the parameter.
4801static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4802 return (__m128d)__a;
4803}
4804
4805/// Compares each of the corresponding double-precision values of two
4806/// 128-bit vectors of [2 x double], using the operation specified by the
4807/// immediate integer operand.
4808///
4809/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4810/// If either value in a comparison is NaN, comparisons that are ordered
4811/// return false, and comparisons that are unordered return true.
4812///
4813/// \headerfile <x86intrin.h>
4814///
4815/// \code
4816/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
4817/// \endcode
4818///
4819/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
4820///
4821/// \param a
4822/// A 128-bit vector of [2 x double].
4823/// \param b
4824/// A 128-bit vector of [2 x double].
4825/// \param c
4826/// An immediate integer operand, with bits [4:0] specifying which comparison
4827/// operation to use: \n
4828/// 0x00: Equal (ordered, non-signaling) \n
4829/// 0x01: Less-than (ordered, signaling) \n
4830/// 0x02: Less-than-or-equal (ordered, signaling) \n
4831/// 0x03: Unordered (non-signaling) \n
4832/// 0x04: Not-equal (unordered, non-signaling) \n
4833/// 0x05: Not-less-than (unordered, signaling) \n
4834/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4835/// 0x07: Ordered (non-signaling)
4836/// \returns A 128-bit vector of [2 x double] containing the comparison results.
4837#define _mm_cmp_pd(a, b, c) \
4838 ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4839 (c)))
4840
4841/// Compares each of the corresponding scalar double-precision values of
4842/// two 128-bit vectors of [2 x double], using the operation specified by the
4843/// immediate integer operand.
4844///
4845/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4846/// If either value in a comparison is NaN, comparisons that are ordered
4847/// return false, and comparisons that are unordered return true.
4848///
4849/// \headerfile <x86intrin.h>
4850///
4851/// \code
4852/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
4853/// \endcode
4854///
4855/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
4856///
4857/// \param a
4858/// A 128-bit vector of [2 x double].
4859/// \param b
4860/// A 128-bit vector of [2 x double].
4861/// \param c
4862/// An immediate integer operand, with bits [4:0] specifying which comparison
4863/// operation to use: \n
4864/// 0x00: Equal (ordered, non-signaling) \n
4865/// 0x01: Less-than (ordered, signaling) \n
4866/// 0x02: Less-than-or-equal (ordered, signaling) \n
4867/// 0x03: Unordered (non-signaling) \n
4868/// 0x04: Not-equal (unordered, non-signaling) \n
4869/// 0x05: Not-less-than (unordered, signaling) \n
4870/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4871/// 0x07: Ordered (non-signaling)
4872/// \returns A 128-bit vector of [2 x double] containing the comparison results.
4873#define _mm_cmp_sd(a, b, c) \
4874 ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4875 (c)))
4876
4877#if defined(__cplusplus)
4878extern "C" {
4879#endif
4880
4881/// Indicates that a spin loop is being executed, for the purpose of
4882/// optimizing power consumption during the loop.
4883///
4884/// \headerfile <x86intrin.h>
4885///
4886/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4887/// Only a prototype is provided here; it has C linkage when built as C++.
4888void _mm_pause(void);
4889
4890#if defined(__cplusplus)
4891} // extern "C"
4892#endif
4893/* Undefine helper macros so they do not remain defined after this header. */
4894#undef __anyext128
4895#undef __trunc64
4896#undef __DEFAULT_FN_ATTRS
4897/* 2-bit selector for _mm_shuffle_pd: x supplies bit 1, y supplies bit 0. */
4898#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4899/* Denormals-zero mode values, as used with _mm_getcsr()/_mm_setcsr(). */
4900#define _MM_DENORMALS_ZERO_ON (0x0040U)
4901#define _MM_DENORMALS_ZERO_OFF (0x0000U)
4902/* Mask selecting the denormals-zero mode bit (same bit as ..._ON). */
4903#define _MM_DENORMALS_ZERO_MASK (0x0040U)
4904/* GET isolates the mode bit. SET clears it and ORs in x as-is (x is not masked). */
4905#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4906#define _MM_SET_DENORMALS_ZERO_MODE(x) \
4907 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4908
4909#endif /* __EMMINTRIN_H */
__device__ _Float16
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3743
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1047
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4532
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4607
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1956
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3586
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1023
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1808
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1526
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4189
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2361
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:588
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:77
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:215
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:135
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4741
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:401
#define __anyext128(x)
Definition: emmintrin.h:58
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1649
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4038
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4264
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2812
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2663
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:823
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1189
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1612
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2560
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3411
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3979
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3546
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1213
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2156
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1556
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1318
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3076
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3003
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1792
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3216
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1826
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2509
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:745
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:196
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3998
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3236
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2698
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:522
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3950
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:301
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1687
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1668
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4642
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2436
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:770
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2682
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3134
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3021
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3095
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2418
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1141
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2867
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:418
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2323
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2260
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1936
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4801
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2985
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4503
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:796
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3156
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:976
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:52
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4662
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2582
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:720
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:672
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4576
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2905
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4680
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3114
static __inline__ void int __a
Definition: emmintrin.h:4058
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:156
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2646
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3930
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1492
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4623
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:480
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4469
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition: emmintrin.h:1473
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3314
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3380
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4555
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3176
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1429
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:256
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1879
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4425
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1340
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3478
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4771
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1237
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2285
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2222
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3365
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2544
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1630
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:3039
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2758
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:609
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1382
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1359
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:117
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1738
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2200
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2178
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2604
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2492
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1758
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4448
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1279
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3844
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2380
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1095
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:567
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2134
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2399
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:384
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:999
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2304
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4726
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2848
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4143
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3690
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2886
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1596
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1071
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:651
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4756
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2012
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2794
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2475
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:950
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:850
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2031
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1917
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2097
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3441
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3724
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: emmintrin.h:4166
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2776
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2055
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:925
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:501
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:695
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4020
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3196
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4786
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1860
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:364
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3498
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2114
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1404
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2454
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:900
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:3057
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:875
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers,...
Definition: emmintrin.h:3351
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3797
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1712
#define __trunc64(x)
Definition: emmintrin.h:56
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3635
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3673
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:239
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1846
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3519
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:280
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2830
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:95
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3910
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1896
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1572
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:347
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2342
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4592
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3875
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:174
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:438
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1776
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:326
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1541
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3461
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3707
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3656
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3395
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2526
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2241
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:1995
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4397
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1511
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2967
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1973
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1261
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2076
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the ...
Definition: emmintrin.h:2625
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1297
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3859
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1119
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:544
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:459
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3890
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2924
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3765
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1453
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3332
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2715
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:630
struct __storeu_i16 *__P __v
Definition: immintrin.h:472