clang  14.0.0git
emmintrin.h
Go to the documentation of this file.
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* Public 128-bit vector types: two doubles / two 64-bit integers, 16-byte
 * aligned. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Same layouts as above but declared with an alignment of 1. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Element-typed 128-bit vectors (signed / plain element types). */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Element-typed 128-bit vectors with unsigned elements. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* Plain char has implementation-defined signedness, so an explicitly signed
 * byte vector is provided as well. It is not meant to show up in the public
 * interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
39 
/* Attribute sets applied to every intrinsic defined in this file:
 * always inlined, no debug info, and compiled for the named target
 * feature(s) with the given minimum vector width. The _MMX variant adds the
 * "mmx" target feature and uses a 64-bit minimum vector width. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
                 __min_vector_width__(64)))
43 
44 /// Adds lower double-precision values in both operands and returns the
45 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
46 /// are copied from the upper double-precision value of the first operand.
47 ///
48 /// \headerfile <x86intrin.h>
49 ///
50 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
51 ///
52 /// \param __a
53 /// A 128-bit vector of [2 x double] containing one of the source operands.
54 /// \param __b
55 /// A 128-bit vector of [2 x double] containing one of the source operands.
56 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
57 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
58 /// from the upper 64 bits of the first source operand.
59 static __inline__ __m128d __DEFAULT_FN_ATTRS
60 _mm_add_sd(__m128d __a, __m128d __b)
61 {
62  __a[0] += __b[0];
63  return __a;
64 }
65 
66 /// Adds two 128-bit vectors of [2 x double].
67 ///
68 /// \headerfile <x86intrin.h>
69 ///
70 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
71 ///
72 /// \param __a
73 /// A 128-bit vector of [2 x double] containing one of the source operands.
74 /// \param __b
75 /// A 128-bit vector of [2 x double] containing one of the source operands.
76 /// \returns A 128-bit vector of [2 x double] containing the sums of both
77 /// operands.
78 static __inline__ __m128d __DEFAULT_FN_ATTRS
79 _mm_add_pd(__m128d __a, __m128d __b)
80 {
81  return (__m128d)((__v2df)__a + (__v2df)__b);
82 }
83 
84 /// Subtracts the lower double-precision value of the second operand
85 /// from the lower double-precision value of the first operand and returns
86 /// the difference in the lower 64 bits of the result. The upper 64 bits of
87 /// the result are copied from the upper double-precision value of the first
88 /// operand.
89 ///
90 /// \headerfile <x86intrin.h>
91 ///
92 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
93 ///
94 /// \param __a
95 /// A 128-bit vector of [2 x double] containing the minuend.
96 /// \param __b
97 /// A 128-bit vector of [2 x double] containing the subtrahend.
98 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
99 /// difference of the lower 64 bits of both operands. The upper 64 bits are
100 /// copied from the upper 64 bits of the first source operand.
101 static __inline__ __m128d __DEFAULT_FN_ATTRS
102 _mm_sub_sd(__m128d __a, __m128d __b)
103 {
104  __a[0] -= __b[0];
105  return __a;
106 }
107 
108 /// Subtracts two 128-bit vectors of [2 x double].
109 ///
110 /// \headerfile <x86intrin.h>
111 ///
112 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
113 ///
114 /// \param __a
115 /// A 128-bit vector of [2 x double] containing the minuend.
116 /// \param __b
117 /// A 128-bit vector of [2 x double] containing the subtrahend.
118 /// \returns A 128-bit vector of [2 x double] containing the differences between
119 /// both operands.
120 static __inline__ __m128d __DEFAULT_FN_ATTRS
121 _mm_sub_pd(__m128d __a, __m128d __b)
122 {
123  return (__m128d)((__v2df)__a - (__v2df)__b);
124 }
125 
126 /// Multiplies lower double-precision values in both operands and returns
127 /// the product in the lower 64 bits of the result. The upper 64 bits of the
128 /// result are copied from the upper double-precision value of the first
129 /// operand.
130 ///
131 /// \headerfile <x86intrin.h>
132 ///
133 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
134 ///
135 /// \param __a
136 /// A 128-bit vector of [2 x double] containing one of the source operands.
137 /// \param __b
138 /// A 128-bit vector of [2 x double] containing one of the source operands.
139 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
140 /// product of the lower 64 bits of both operands. The upper 64 bits are
141 /// copied from the upper 64 bits of the first source operand.
142 static __inline__ __m128d __DEFAULT_FN_ATTRS
143 _mm_mul_sd(__m128d __a, __m128d __b)
144 {
145  __a[0] *= __b[0];
146  return __a;
147 }
148 
149 /// Multiplies two 128-bit vectors of [2 x double].
150 ///
151 /// \headerfile <x86intrin.h>
152 ///
153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
154 ///
155 /// \param __a
156 /// A 128-bit vector of [2 x double] containing one of the operands.
157 /// \param __b
158 /// A 128-bit vector of [2 x double] containing one of the operands.
159 /// \returns A 128-bit vector of [2 x double] containing the products of both
160 /// operands.
161 static __inline__ __m128d __DEFAULT_FN_ATTRS
162 _mm_mul_pd(__m128d __a, __m128d __b)
163 {
164  return (__m128d)((__v2df)__a * (__v2df)__b);
165 }
166 
167 /// Divides the lower double-precision value of the first operand by the
168 /// lower double-precision value of the second operand and returns the
169 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
170 /// result are copied from the upper double-precision value of the first
171 /// operand.
172 ///
173 /// \headerfile <x86intrin.h>
174 ///
175 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
176 ///
177 /// \param __a
178 /// A 128-bit vector of [2 x double] containing the dividend.
179 /// \param __b
180 /// A 128-bit vector of [2 x double] containing divisor.
181 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
182 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
183 /// copied from the upper 64 bits of the first source operand.
184 static __inline__ __m128d __DEFAULT_FN_ATTRS
185 _mm_div_sd(__m128d __a, __m128d __b)
186 {
187  __a[0] /= __b[0];
188  return __a;
189 }
190 
191 /// Performs an element-by-element division of two 128-bit vectors of
192 /// [2 x double].
193 ///
194 /// \headerfile <x86intrin.h>
195 ///
196 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
197 ///
198 /// \param __a
199 /// A 128-bit vector of [2 x double] containing the dividend.
200 /// \param __b
201 /// A 128-bit vector of [2 x double] containing the divisor.
202 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
203 /// operands.
204 static __inline__ __m128d __DEFAULT_FN_ATTRS
205 _mm_div_pd(__m128d __a, __m128d __b)
206 {
207  return (__m128d)((__v2df)__a / (__v2df)__b);
208 }
209 
210 /// Calculates the square root of the lower double-precision value of
211 /// the second operand and returns it in the lower 64 bits of the result.
212 /// The upper 64 bits of the result are copied from the upper
213 /// double-precision value of the first operand.
214 ///
215 /// \headerfile <x86intrin.h>
216 ///
217 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
218 ///
219 /// \param __a
220 /// A 128-bit vector of [2 x double] containing one of the operands. The
221 /// upper 64 bits of this operand are copied to the upper 64 bits of the
222 /// result.
223 /// \param __b
224 /// A 128-bit vector of [2 x double] containing one of the operands. The
225 /// square root is calculated using the lower 64 bits of this operand.
226 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
227 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
228 /// bits are copied from the upper 64 bits of operand \a __a.
229 static __inline__ __m128d __DEFAULT_FN_ATTRS
230 _mm_sqrt_sd(__m128d __a, __m128d __b)
231 {
232  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
233  return __extension__ (__m128d) { __c[0], __a[1] };
234 }
235 
236 /// Calculates the square root of the each of two values stored in a
237 /// 128-bit vector of [2 x double].
238 ///
239 /// \headerfile <x86intrin.h>
240 ///
241 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
242 ///
243 /// \param __a
244 /// A 128-bit vector of [2 x double].
245 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
246 /// values in the operand.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS
248 _mm_sqrt_pd(__m128d __a)
249 {
250  return __builtin_ia32_sqrtpd((__v2df)__a);
251 }
252 
253 /// Compares lower 64-bit double-precision values of both operands, and
254 /// returns the lesser of the pair of values in the lower 64-bits of the
255 /// result. The upper 64 bits of the result are copied from the upper
256 /// double-precision value of the first operand.
257 ///
258 /// \headerfile <x86intrin.h>
259 ///
260 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
261 ///
262 /// \param __a
263 /// A 128-bit vector of [2 x double] containing one of the operands. The
264 /// lower 64 bits of this operand are used in the comparison.
265 /// \param __b
266 /// A 128-bit vector of [2 x double] containing one of the operands. The
267 /// lower 64 bits of this operand are used in the comparison.
268 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
269 /// minimum value between both operands. The upper 64 bits are copied from
270 /// the upper 64 bits of the first source operand.
271 static __inline__ __m128d __DEFAULT_FN_ATTRS
272 _mm_min_sd(__m128d __a, __m128d __b)
273 {
274  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
275 }
276 
277 /// Performs element-by-element comparison of the two 128-bit vectors of
278 /// [2 x double] and returns the vector containing the lesser of each pair of
279 /// values.
280 ///
281 /// \headerfile <x86intrin.h>
282 ///
283 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
284 ///
285 /// \param __a
286 /// A 128-bit vector of [2 x double] containing one of the operands.
287 /// \param __b
288 /// A 128-bit vector of [2 x double] containing one of the operands.
289 /// \returns A 128-bit vector of [2 x double] containing the minimum values
290 /// between both operands.
291 static __inline__ __m128d __DEFAULT_FN_ATTRS
292 _mm_min_pd(__m128d __a, __m128d __b)
293 {
294  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
295 }
296 
297 /// Compares lower 64-bit double-precision values of both operands, and
298 /// returns the greater of the pair of values in the lower 64-bits of the
299 /// result. The upper 64 bits of the result are copied from the upper
300 /// double-precision value of the first operand.
301 ///
302 /// \headerfile <x86intrin.h>
303 ///
304 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
305 ///
306 /// \param __a
307 /// A 128-bit vector of [2 x double] containing one of the operands. The
308 /// lower 64 bits of this operand are used in the comparison.
309 /// \param __b
310 /// A 128-bit vector of [2 x double] containing one of the operands. The
311 /// lower 64 bits of this operand are used in the comparison.
312 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
313 /// maximum value between both operands. The upper 64 bits are copied from
314 /// the upper 64 bits of the first source operand.
315 static __inline__ __m128d __DEFAULT_FN_ATTRS
316 _mm_max_sd(__m128d __a, __m128d __b)
317 {
318  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
319 }
320 
321 /// Performs element-by-element comparison of the two 128-bit vectors of
322 /// [2 x double] and returns the vector containing the greater of each pair
323 /// of values.
324 ///
325 /// \headerfile <x86intrin.h>
326 ///
327 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
328 ///
329 /// \param __a
330 /// A 128-bit vector of [2 x double] containing one of the operands.
331 /// \param __b
332 /// A 128-bit vector of [2 x double] containing one of the operands.
333 /// \returns A 128-bit vector of [2 x double] containing the maximum values
334 /// between both operands.
335 static __inline__ __m128d __DEFAULT_FN_ATTRS
336 _mm_max_pd(__m128d __a, __m128d __b)
337 {
338  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
339 }
340 
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346 ///
347 /// \param __a
348 /// A 128-bit vector of [2 x double] containing one of the source operands.
349 /// \param __b
350 /// A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 /// values between both operands.
353 static __inline__ __m128d __DEFAULT_FN_ATTRS
354 _mm_and_pd(__m128d __a, __m128d __b)
355 {
356  return (__m128d)((__v2du)__a & (__v2du)__b);
357 }
358 
359 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
360 /// the one's complement of the values contained in the first source operand.
361 ///
362 /// \headerfile <x86intrin.h>
363 ///
364 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
365 ///
366 /// \param __a
367 /// A 128-bit vector of [2 x double] containing the left source operand. The
368 /// one's complement of this value is used in the bitwise AND.
369 /// \param __b
370 /// A 128-bit vector of [2 x double] containing the right source operand.
371 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
372 /// values in the second operand and the one's complement of the first
373 /// operand.
374 static __inline__ __m128d __DEFAULT_FN_ATTRS
375 _mm_andnot_pd(__m128d __a, __m128d __b)
376 {
377  return (__m128d)(~(__v2du)__a & (__v2du)__b);
378 }
379 
380 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
381 ///
382 /// \headerfile <x86intrin.h>
383 ///
384 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
385 ///
386 /// \param __a
387 /// A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \param __b
389 /// A 128-bit vector of [2 x double] containing one of the source operands.
390 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
391 /// values between both operands.
392 static __inline__ __m128d __DEFAULT_FN_ATTRS
393 _mm_or_pd(__m128d __a, __m128d __b)
394 {
395  return (__m128d)((__v2du)__a | (__v2du)__b);
396 }
397 
398 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
399 ///
400 /// \headerfile <x86intrin.h>
401 ///
402 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
403 ///
404 /// \param __a
405 /// A 128-bit vector of [2 x double] containing one of the source operands.
406 /// \param __b
407 /// A 128-bit vector of [2 x double] containing one of the source operands.
408 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
409 /// values between both operands.
410 static __inline__ __m128d __DEFAULT_FN_ATTRS
411 _mm_xor_pd(__m128d __a, __m128d __b)
412 {
413  return (__m128d)((__v2du)__a ^ (__v2du)__b);
414 }
415 
416 /// Compares each of the corresponding double-precision values of the
417 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
418 /// for false, 0xFFFFFFFFFFFFFFFF for true.
419 ///
420 /// \headerfile <x86intrin.h>
421 ///
422 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
423 ///
424 /// \param __a
425 /// A 128-bit vector of [2 x double].
426 /// \param __b
427 /// A 128-bit vector of [2 x double].
428 /// \returns A 128-bit vector containing the comparison results.
429 static __inline__ __m128d __DEFAULT_FN_ATTRS
430 _mm_cmpeq_pd(__m128d __a, __m128d __b)
431 {
432  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
433 }
434 
435 /// Compares each of the corresponding double-precision values of the
436 /// 128-bit vectors of [2 x double] to determine if the values in the first
437 /// operand are less than those in the second operand. Each comparison
438 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
439 ///
440 /// \headerfile <x86intrin.h>
441 ///
442 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
443 ///
444 /// \param __a
445 /// A 128-bit vector of [2 x double].
446 /// \param __b
447 /// A 128-bit vector of [2 x double].
448 /// \returns A 128-bit vector containing the comparison results.
449 static __inline__ __m128d __DEFAULT_FN_ATTRS
450 _mm_cmplt_pd(__m128d __a, __m128d __b)
451 {
452  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
453 }
454 
455 /// Compares each of the corresponding double-precision values of the
456 /// 128-bit vectors of [2 x double] to determine if the values in the first
457 /// operand are less than or equal to those in the second operand.
458 ///
459 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
460 ///
461 /// \headerfile <x86intrin.h>
462 ///
463 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
464 ///
465 /// \param __a
466 /// A 128-bit vector of [2 x double].
467 /// \param __b
468 /// A 128-bit vector of [2 x double].
469 /// \returns A 128-bit vector containing the comparison results.
470 static __inline__ __m128d __DEFAULT_FN_ATTRS
471 _mm_cmple_pd(__m128d __a, __m128d __b)
472 {
473  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
474 }
475 
476 /// Compares each of the corresponding double-precision values of the
477 /// 128-bit vectors of [2 x double] to determine if the values in the first
478 /// operand are greater than those in the second operand.
479 ///
480 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
481 ///
482 /// \headerfile <x86intrin.h>
483 ///
484 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
485 ///
486 /// \param __a
487 /// A 128-bit vector of [2 x double].
488 /// \param __b
489 /// A 128-bit vector of [2 x double].
490 /// \returns A 128-bit vector containing the comparison results.
491 static __inline__ __m128d __DEFAULT_FN_ATTRS
492 _mm_cmpgt_pd(__m128d __a, __m128d __b)
493 {
494  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
495 }
496 
497 /// Compares each of the corresponding double-precision values of the
498 /// 128-bit vectors of [2 x double] to determine if the values in the first
499 /// operand are greater than or equal to those in the second operand.
500 ///
501 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
502 ///
503 /// \headerfile <x86intrin.h>
504 ///
505 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
506 ///
507 /// \param __a
508 /// A 128-bit vector of [2 x double].
509 /// \param __b
510 /// A 128-bit vector of [2 x double].
511 /// \returns A 128-bit vector containing the comparison results.
512 static __inline__ __m128d __DEFAULT_FN_ATTRS
513 _mm_cmpge_pd(__m128d __a, __m128d __b)
514 {
515  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
516 }
517 
518 /// Compares each of the corresponding double-precision values of the
519 /// 128-bit vectors of [2 x double] to determine if the values in the first
520 /// operand are ordered with respect to those in the second operand.
521 ///
522 /// A pair of double-precision values are "ordered" with respect to each
523 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
524 /// 0xFFFFFFFFFFFFFFFF for true.
525 ///
526 /// \headerfile <x86intrin.h>
527 ///
528 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
529 ///
530 /// \param __a
531 /// A 128-bit vector of [2 x double].
532 /// \param __b
533 /// A 128-bit vector of [2 x double].
534 /// \returns A 128-bit vector containing the comparison results.
535 static __inline__ __m128d __DEFAULT_FN_ATTRS
536 _mm_cmpord_pd(__m128d __a, __m128d __b)
537 {
538  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
539 }
540 
541 /// Compares each of the corresponding double-precision values of the
542 /// 128-bit vectors of [2 x double] to determine if the values in the first
543 /// operand are unordered with respect to those in the second operand.
544 ///
545 /// A pair of double-precision values are "unordered" with respect to each
546 /// other if one or both values are NaN. Each comparison yields 0x0 for
547 /// false, 0xFFFFFFFFFFFFFFFF for true.
548 ///
549 /// \headerfile <x86intrin.h>
550 ///
551 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
552 /// instruction.
553 ///
554 /// \param __a
555 /// A 128-bit vector of [2 x double].
556 /// \param __b
557 /// A 128-bit vector of [2 x double].
558 /// \returns A 128-bit vector containing the comparison results.
559 static __inline__ __m128d __DEFAULT_FN_ATTRS
560 _mm_cmpunord_pd(__m128d __a, __m128d __b)
561 {
562  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
563 }
564 
565 /// Compares each of the corresponding double-precision values of the
566 /// 128-bit vectors of [2 x double] to determine if the values in the first
567 /// operand are unequal to those in the second operand.
568 ///
569 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
570 ///
571 /// \headerfile <x86intrin.h>
572 ///
573 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
574 ///
575 /// \param __a
576 /// A 128-bit vector of [2 x double].
577 /// \param __b
578 /// A 128-bit vector of [2 x double].
579 /// \returns A 128-bit vector containing the comparison results.
580 static __inline__ __m128d __DEFAULT_FN_ATTRS
581 _mm_cmpneq_pd(__m128d __a, __m128d __b)
582 {
583  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
584 }
585 
586 /// Compares each of the corresponding double-precision values of the
587 /// 128-bit vectors of [2 x double] to determine if the values in the first
588 /// operand are not less than those in the second operand.
589 ///
590 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
591 ///
592 /// \headerfile <x86intrin.h>
593 ///
594 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
595 ///
596 /// \param __a
597 /// A 128-bit vector of [2 x double].
598 /// \param __b
599 /// A 128-bit vector of [2 x double].
600 /// \returns A 128-bit vector containing the comparison results.
601 static __inline__ __m128d __DEFAULT_FN_ATTRS
602 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
603 {
604  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
605 }
606 
607 /// Compares each of the corresponding double-precision values of the
608 /// 128-bit vectors of [2 x double] to determine if the values in the first
609 /// operand are not less than or equal to those in the second operand.
610 ///
611 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
612 ///
613 /// \headerfile <x86intrin.h>
614 ///
615 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
616 ///
617 /// \param __a
618 /// A 128-bit vector of [2 x double].
619 /// \param __b
620 /// A 128-bit vector of [2 x double].
621 /// \returns A 128-bit vector containing the comparison results.
622 static __inline__ __m128d __DEFAULT_FN_ATTRS
623 _mm_cmpnle_pd(__m128d __a, __m128d __b)
624 {
625  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
626 }
627 
628 /// Compares each of the corresponding double-precision values of the
629 /// 128-bit vectors of [2 x double] to determine if the values in the first
630 /// operand are not greater than those in the second operand.
631 ///
632 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
633 ///
634 /// \headerfile <x86intrin.h>
635 ///
636 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
637 ///
638 /// \param __a
639 /// A 128-bit vector of [2 x double].
640 /// \param __b
641 /// A 128-bit vector of [2 x double].
642 /// \returns A 128-bit vector containing the comparison results.
643 static __inline__ __m128d __DEFAULT_FN_ATTRS
644 _mm_cmpngt_pd(__m128d __a, __m128d __b)
645 {
646  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
647 }
648 
649 /// Compares each of the corresponding double-precision values of the
650 /// 128-bit vectors of [2 x double] to determine if the values in the first
651 /// operand are not greater than or equal to those in the second operand.
652 ///
653 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
654 ///
655 /// \headerfile <x86intrin.h>
656 ///
657 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
658 ///
659 /// \param __a
660 /// A 128-bit vector of [2 x double].
661 /// \param __b
662 /// A 128-bit vector of [2 x double].
663 /// \returns A 128-bit vector containing the comparison results.
664 static __inline__ __m128d __DEFAULT_FN_ATTRS
665 _mm_cmpnge_pd(__m128d __a, __m128d __b)
666 {
667  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
668 }
669 
670 /// Compares the lower double-precision floating-point values in each of
671 /// the two 128-bit floating-point vectors of [2 x double] for equality.
672 ///
673 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
674 ///
675 /// \headerfile <x86intrin.h>
676 ///
677 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
678 ///
679 /// \param __a
680 /// A 128-bit vector of [2 x double]. The lower double-precision value is
681 /// compared to the lower double-precision value of \a __b.
682 /// \param __b
683 /// A 128-bit vector of [2 x double]. The lower double-precision value is
684 /// compared to the lower double-precision value of \a __a.
685 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
686 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
687 static __inline__ __m128d __DEFAULT_FN_ATTRS
688 _mm_cmpeq_sd(__m128d __a, __m128d __b)
689 {
690  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
691 }
692 
693 /// Compares the lower double-precision floating-point values in each of
694 /// the two 128-bit floating-point vectors of [2 x double] to determine if
695 /// the value in the first parameter is less than the corresponding value in
696 /// the second parameter.
697 ///
698 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
699 ///
700 /// \headerfile <x86intrin.h>
701 ///
702 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
703 ///
704 /// \param __a
705 /// A 128-bit vector of [2 x double]. The lower double-precision value is
706 /// compared to the lower double-precision value of \a __b.
707 /// \param __b
708 /// A 128-bit vector of [2 x double]. The lower double-precision value is
709 /// compared to the lower double-precision value of \a __a.
710 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
711 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
712 static __inline__ __m128d __DEFAULT_FN_ATTRS
713 _mm_cmplt_sd(__m128d __a, __m128d __b)
714 {
715  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
716 }
717 
718 /// Compares the lower double-precision floating-point values in each of
719 /// the two 128-bit floating-point vectors of [2 x double] to determine if
720 /// the value in the first parameter is less than or equal to the
721 /// corresponding value in the second parameter.
722 ///
723 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
724 ///
725 /// \headerfile <x86intrin.h>
726 ///
727 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
728 ///
729 /// \param __a
730 /// A 128-bit vector of [2 x double]. The lower double-precision value is
731 /// compared to the lower double-precision value of \a __b.
732 /// \param __b
733 /// A 128-bit vector of [2 x double]. The lower double-precision value is
734 /// compared to the lower double-precision value of \a __a.
735 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
736 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
737 static __inline__ __m128d __DEFAULT_FN_ATTRS
738 _mm_cmple_sd(__m128d __a, __m128d __b)
739 {
740  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
741 }
742 
743 /// Compares the lower double-precision floating-point values in each of
744 /// the two 128-bit floating-point vectors of [2 x double] to determine if
745 /// the value in the first parameter is greater than the corresponding value
746 /// in the second parameter.
747 ///
748 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
749 ///
750 /// \headerfile <x86intrin.h>
751 ///
752 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
753 ///
754 /// \param __a
755 /// A 128-bit vector of [2 x double]. The lower double-precision value is
756 /// compared to the lower double-precision value of \a __b.
757 /// \param __b
758 /// A 128-bit vector of [2 x double]. The lower double-precision value is
759 /// compared to the lower double-precision value of \a __a.
760 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
761 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
762 static __inline__ __m128d __DEFAULT_FN_ATTRS
763 _mm_cmpgt_sd(__m128d __a, __m128d __b)
764 {
765  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
766  return __extension__ (__m128d) { __c[0], __a[1] };
767 }
768 
769 /// Compares the lower double-precision floating-point values in each of
770 /// the two 128-bit floating-point vectors of [2 x double] to determine if
771 /// the value in the first parameter is greater than or equal to the
772 /// corresponding value in the second parameter.
773 ///
774 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
775 ///
776 /// \headerfile <x86intrin.h>
777 ///
778 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
779 ///
780 /// \param __a
781 /// A 128-bit vector of [2 x double]. The lower double-precision value is
782 /// compared to the lower double-precision value of \a __b.
783 /// \param __b
784 /// A 128-bit vector of [2 x double]. The lower double-precision value is
785 /// compared to the lower double-precision value of \a __a.
786 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
787 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
788 static __inline__ __m128d __DEFAULT_FN_ATTRS
789 _mm_cmpge_sd(__m128d __a, __m128d __b)
790 {
791  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
792  return __extension__ (__m128d) { __c[0], __a[1] };
793 }
794 
795 /// Compares the lower double-precision floating-point values in each of
796 /// the two 128-bit floating-point vectors of [2 x double] to determine if
797 /// the value in the first parameter is "ordered" with respect to the
798 /// corresponding value in the second parameter.
799 ///
800 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
801 /// of double-precision values are "ordered" with respect to each other if
802 /// neither value is a NaN.
803 ///
804 /// \headerfile <x86intrin.h>
805 ///
806 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
807 ///
808 /// \param __a
809 /// A 128-bit vector of [2 x double]. The lower double-precision value is
810 /// compared to the lower double-precision value of \a __b.
811 /// \param __b
812 /// A 128-bit vector of [2 x double]. The lower double-precision value is
813 /// compared to the lower double-precision value of \a __a.
814 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
815 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
816 static __inline__ __m128d __DEFAULT_FN_ATTRS
817 _mm_cmpord_sd(__m128d __a, __m128d __b)
818 {
819  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
820 }
821 
822 /// Compares the lower double-precision floating-point values in each of
823 /// the two 128-bit floating-point vectors of [2 x double] to determine if
824 /// the value in the first parameter is "unordered" with respect to the
825 /// corresponding value in the second parameter.
826 ///
827 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
828 /// of double-precision values are "unordered" with respect to each other if
829 /// one or both values are NaN.
830 ///
831 /// \headerfile <x86intrin.h>
832 ///
833 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
834 /// instruction.
835 ///
836 /// \param __a
837 /// A 128-bit vector of [2 x double]. The lower double-precision value is
838 /// compared to the lower double-precision value of \a __b.
839 /// \param __b
840 /// A 128-bit vector of [2 x double]. The lower double-precision value is
841 /// compared to the lower double-precision value of \a __a.
842 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
843 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
844 static __inline__ __m128d __DEFAULT_FN_ATTRS
845 _mm_cmpunord_sd(__m128d __a, __m128d __b)
846 {
847  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
848 }
849 
850 /// Compares the lower double-precision floating-point values in each of
851 /// the two 128-bit floating-point vectors of [2 x double] to determine if
852 /// the value in the first parameter is unequal to the corresponding value in
853 /// the second parameter.
854 ///
855 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
856 ///
857 /// \headerfile <x86intrin.h>
858 ///
859 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
860 ///
861 /// \param __a
862 /// A 128-bit vector of [2 x double]. The lower double-precision value is
863 /// compared to the lower double-precision value of \a __b.
864 /// \param __b
865 /// A 128-bit vector of [2 x double]. The lower double-precision value is
866 /// compared to the lower double-precision value of \a __a.
867 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
868 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
869 static __inline__ __m128d __DEFAULT_FN_ATTRS
870 _mm_cmpneq_sd(__m128d __a, __m128d __b)
871 {
872  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
873 }
874 
875 /// Compares the lower double-precision floating-point values in each of
876 /// the two 128-bit floating-point vectors of [2 x double] to determine if
877 /// the value in the first parameter is not less than the corresponding
878 /// value in the second parameter.
879 ///
880 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
881 ///
882 /// \headerfile <x86intrin.h>
883 ///
884 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
885 ///
886 /// \param __a
887 /// A 128-bit vector of [2 x double]. The lower double-precision value is
888 /// compared to the lower double-precision value of \a __b.
889 /// \param __b
890 /// A 128-bit vector of [2 x double]. The lower double-precision value is
891 /// compared to the lower double-precision value of \a __a.
892 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
893 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894 static __inline__ __m128d __DEFAULT_FN_ATTRS
895 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
896 {
897  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
898 }
899 
900 /// Compares the lower double-precision floating-point values in each of
901 /// the two 128-bit floating-point vectors of [2 x double] to determine if
902 /// the value in the first parameter is not less than or equal to the
903 /// corresponding value in the second parameter.
904 ///
905 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
906 ///
907 /// \headerfile <x86intrin.h>
908 ///
909 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
910 ///
911 /// \param __a
912 /// A 128-bit vector of [2 x double]. The lower double-precision value is
913 /// compared to the lower double-precision value of \a __b.
914 /// \param __b
915 /// A 128-bit vector of [2 x double]. The lower double-precision value is
916 /// compared to the lower double-precision value of \a __a.
917 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
918 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
919 static __inline__ __m128d __DEFAULT_FN_ATTRS
920 _mm_cmpnle_sd(__m128d __a, __m128d __b)
921 {
922  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
923 }
924 
925 /// Compares the lower double-precision floating-point values in each of
926 /// the two 128-bit floating-point vectors of [2 x double] to determine if
927 /// the value in the first parameter is not greater than the corresponding
928 /// value in the second parameter.
929 ///
930 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
931 ///
932 /// \headerfile <x86intrin.h>
933 ///
934 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
935 ///
936 /// \param __a
937 /// A 128-bit vector of [2 x double]. The lower double-precision value is
938 /// compared to the lower double-precision value of \a __b.
939 /// \param __b
940 /// A 128-bit vector of [2 x double]. The lower double-precision value is
941 /// compared to the lower double-precision value of \a __a.
942 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
943 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
944 static __inline__ __m128d __DEFAULT_FN_ATTRS
945 _mm_cmpngt_sd(__m128d __a, __m128d __b)
946 {
947  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
948  return __extension__ (__m128d) { __c[0], __a[1] };
949 }
950 
951 /// Compares the lower double-precision floating-point values in each of
952 /// the two 128-bit floating-point vectors of [2 x double] to determine if
953 /// the value in the first parameter is not greater than or equal to the
954 /// corresponding value in the second parameter.
955 ///
956 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
957 ///
958 /// \headerfile <x86intrin.h>
959 ///
960 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
961 ///
962 /// \param __a
963 /// A 128-bit vector of [2 x double]. The lower double-precision value is
964 /// compared to the lower double-precision value of \a __b.
965 /// \param __b
966 /// A 128-bit vector of [2 x double]. The lower double-precision value is
967 /// compared to the lower double-precision value of \a __a.
968 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
969 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
970 static __inline__ __m128d __DEFAULT_FN_ATTRS
971 _mm_cmpnge_sd(__m128d __a, __m128d __b)
972 {
973  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
974  return __extension__ (__m128d) { __c[0], __a[1] };
975 }
976 
977 /// Compares the lower double-precision floating-point values in each of
978 /// the two 128-bit floating-point vectors of [2 x double] for equality.
979 ///
980 /// The comparison yields 0 for false, 1 for true. If either of the two
981 /// lower double-precision values is NaN, 0 is returned.
982 ///
983 /// \headerfile <x86intrin.h>
984 ///
985 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
986 ///
987 /// \param __a
988 /// A 128-bit vector of [2 x double]. The lower double-precision value is
989 /// compared to the lower double-precision value of \a __b.
990 /// \param __b
991 /// A 128-bit vector of [2 x double]. The lower double-precision value is
992 /// compared to the lower double-precision value of \a __a.
993 /// \returns An integer containing the comparison results. If either of the two
994 /// lower double-precision values is NaN, 0 is returned.
995 static __inline__ int __DEFAULT_FN_ATTRS
996 _mm_comieq_sd(__m128d __a, __m128d __b)
997 {
998  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
999 }
1000 
1001 /// Compares the lower double-precision floating-point values in each of
1002 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1003 /// the value in the first parameter is less than the corresponding value in
1004 /// the second parameter.
1005 ///
1006 /// The comparison yields 0 for false, 1 for true. If either of the two
1007 /// lower double-precision values is NaN, 0 is returned.
1008 ///
1009 /// \headerfile <x86intrin.h>
1010 ///
1011 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1012 ///
1013 /// \param __a
1014 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1015 /// compared to the lower double-precision value of \a __b.
1016 /// \param __b
1017 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1018 /// compared to the lower double-precision value of \a __a.
1019 /// \returns An integer containing the comparison results. If either of the two
1020 /// lower double-precision values is NaN, 0 is returned.
1021 static __inline__ int __DEFAULT_FN_ATTRS
1022 _mm_comilt_sd(__m128d __a, __m128d __b)
1023 {
1024  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1025 }
1026 
1027 /// Compares the lower double-precision floating-point values in each of
1028 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1029 /// the value in the first parameter is less than or equal to the
1030 /// corresponding value in the second parameter.
1031 ///
1032 /// The comparison yields 0 for false, 1 for true. If either of the two
1033 /// lower double-precision values is NaN, 0 is returned.
1034 ///
1035 /// \headerfile <x86intrin.h>
1036 ///
1037 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1038 ///
1039 /// \param __a
1040 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1041 /// compared to the lower double-precision value of \a __b.
1042 /// \param __b
1043 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1044 /// compared to the lower double-precision value of \a __a.
1045 /// \returns An integer containing the comparison results. If either of the two
1046 /// lower double-precision values is NaN, 0 is returned.
1047 static __inline__ int __DEFAULT_FN_ATTRS
1048 _mm_comile_sd(__m128d __a, __m128d __b)
1049 {
1050  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1051 }
1052 
1053 /// Compares the lower double-precision floating-point values in each of
1054 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1055 /// the value in the first parameter is greater than the corresponding value
1056 /// in the second parameter.
1057 ///
1058 /// The comparison yields 0 for false, 1 for true. If either of the two
1059 /// lower double-precision values is NaN, 0 is returned.
1060 ///
1061 /// \headerfile <x86intrin.h>
1062 ///
1063 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1064 ///
1065 /// \param __a
1066 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1067 /// compared to the lower double-precision value of \a __b.
1068 /// \param __b
1069 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1070 /// compared to the lower double-precision value of \a __a.
1071 /// \returns An integer containing the comparison results. If either of the two
1072 /// lower double-precision values is NaN, 0 is returned.
1073 static __inline__ int __DEFAULT_FN_ATTRS
1074 _mm_comigt_sd(__m128d __a, __m128d __b)
1075 {
1076  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1077 }
1078 
1079 /// Compares the lower double-precision floating-point values in each of
1080 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1081 /// the value in the first parameter is greater than or equal to the
1082 /// corresponding value in the second parameter.
1083 ///
1084 /// The comparison yields 0 for false, 1 for true. If either of the two
1085 /// lower double-precision values is NaN, 0 is returned.
1086 ///
1087 /// \headerfile <x86intrin.h>
1088 ///
1089 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1090 ///
1091 /// \param __a
1092 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1093 /// compared to the lower double-precision value of \a __b.
1094 /// \param __b
1095 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1096 /// compared to the lower double-precision value of \a __a.
1097 /// \returns An integer containing the comparison results. If either of the two
1098 /// lower double-precision values is NaN, 0 is returned.
1099 static __inline__ int __DEFAULT_FN_ATTRS
1100 _mm_comige_sd(__m128d __a, __m128d __b)
1101 {
1102  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1103 }
1104 
1105 /// Compares the lower double-precision floating-point values in each of
1106 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1107 /// the value in the first parameter is unequal to the corresponding value in
1108 /// the second parameter.
1109 ///
1110 /// The comparison yields 0 for false, 1 for true. If either of the two
1111 /// lower double-precision values is NaN, 1 is returned.
1112 ///
1113 /// \headerfile <x86intrin.h>
1114 ///
1115 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1116 ///
1117 /// \param __a
1118 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1119 /// compared to the lower double-precision value of \a __b.
1120 /// \param __b
1121 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1122 /// compared to the lower double-precision value of \a __a.
1123 /// \returns An integer containing the comparison results. If either of the two
1124 /// lower double-precision values is NaN, 1 is returned.
1125 static __inline__ int __DEFAULT_FN_ATTRS
1126 _mm_comineq_sd(__m128d __a, __m128d __b)
1127 {
1128  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1129 }
1130 
1131 /// Compares the lower double-precision floating-point values in each of
1132 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1133 /// comparison yields 0 for false, 1 for true.
1134 ///
1135 /// If either of the two lower double-precision values is NaN, 0 is returned.
1136 ///
1137 /// \headerfile <x86intrin.h>
1138 ///
1139 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1140 ///
1141 /// \param __a
1142 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1143 /// compared to the lower double-precision value of \a __b.
1144 /// \param __b
1145 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1146 /// compared to the lower double-precision value of \a __a.
1147 /// \returns An integer containing the comparison results. If either of the two
1148 /// lower double-precision values is NaN, 0 is returned.
1149 static __inline__ int __DEFAULT_FN_ATTRS
1150 _mm_ucomieq_sd(__m128d __a, __m128d __b)
1151 {
1152  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1153 }
1154 
1155 /// Compares the lower double-precision floating-point values in each of
1156 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1157 /// the value in the first parameter is less than the corresponding value in
1158 /// the second parameter.
1159 ///
1160 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1161 /// double-precision values is NaN, 0 is returned.
1162 ///
1163 /// \headerfile <x86intrin.h>
1164 ///
1165 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1166 ///
1167 /// \param __a
1168 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1169 /// compared to the lower double-precision value of \a __b.
1170 /// \param __b
1171 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1172 /// compared to the lower double-precision value of \a __a.
1173 /// \returns An integer containing the comparison results. If either of the two
1174 /// lower double-precision values is NaN, 0 is returned.
1175 static __inline__ int __DEFAULT_FN_ATTRS
1176 _mm_ucomilt_sd(__m128d __a, __m128d __b)
1177 {
1178  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1179 }
1180 
1181 /// Compares the lower double-precision floating-point values in each of
1182 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1183 /// the value in the first parameter is less than or equal to the
1184 /// corresponding value in the second parameter.
1185 ///
1186 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1187 /// double-precision values is NaN, 0 is returned.
1188 ///
1189 /// \headerfile <x86intrin.h>
1190 ///
1191 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1192 ///
1193 /// \param __a
1194 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1195 /// compared to the lower double-precision value of \a __b.
1196 /// \param __b
1197 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1198 /// compared to the lower double-precision value of \a __a.
1199 /// \returns An integer containing the comparison results. If either of the two
1200 /// lower double-precision values is NaN, 0 is returned.
1201 static __inline__ int __DEFAULT_FN_ATTRS
1202 _mm_ucomile_sd(__m128d __a, __m128d __b)
1203 {
1204  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1205 }
1206 
1207 /// Compares the lower double-precision floating-point values in each of
1208 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1209 /// the value in the first parameter is greater than the corresponding value
1210 /// in the second parameter.
1211 ///
1212 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1213 /// double-precision values is NaN, 0 is returned.
1214 ///
1215 /// \headerfile <x86intrin.h>
1216 ///
1217 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1218 ///
1219 /// \param __a
1220 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1221 /// compared to the lower double-precision value of \a __b.
1222 /// \param __b
1223 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1224 /// compared to the lower double-precision value of \a __a.
1225 /// \returns An integer containing the comparison results. If either of the two
1226 /// lower double-precision values is NaN, 0 is returned.
1227 static __inline__ int __DEFAULT_FN_ATTRS
1228 _mm_ucomigt_sd(__m128d __a, __m128d __b)
1229 {
1230  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1231 }
1232 
1233 /// Compares the lower double-precision floating-point values in each of
1234 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1235 /// the value in the first parameter is greater than or equal to the
1236 /// corresponding value in the second parameter.
1237 ///
1238 /// The comparison yields 0 for false, 1 for true. If either of the two
1239 /// lower double-precision values is NaN, 0 is returned.
1240 ///
1241 /// \headerfile <x86intrin.h>
1242 ///
1243 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1244 ///
1245 /// \param __a
1246 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1247 /// compared to the lower double-precision value of \a __b.
1248 /// \param __b
1249 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1250 /// compared to the lower double-precision value of \a __a.
1251 /// \returns An integer containing the comparison results. If either of the two
1252 /// lower double-precision values is NaN, 0 is returned.
1253 static __inline__ int __DEFAULT_FN_ATTRS
1254 _mm_ucomige_sd(__m128d __a, __m128d __b)
1255 {
1256  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1257 }
1258 
1259 /// Compares the lower double-precision floating-point values in each of
1260 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1261 /// the value in the first parameter is unequal to the corresponding value in
1262 /// the second parameter.
1263 ///
1264 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1265 /// double-precision values is NaN, 1 is returned.
1266 ///
1267 /// \headerfile <x86intrin.h>
1268 ///
1269 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1270 ///
1271 /// \param __a
1272 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1273 /// compared to the lower double-precision value of \a __b.
1274 /// \param __b
1275 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1276 /// compared to the lower double-precision value of \a __a.
1277 /// \returns An integer containing the comparison result. If either of the two
1278 /// lower double-precision values is NaN, 1 is returned.
1279 static __inline__ int __DEFAULT_FN_ATTRS
1280 _mm_ucomineq_sd(__m128d __a, __m128d __b)
1281 {
1282  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1283 }
1284 
1285 /// Converts the two double-precision floating-point elements of a
1286 /// 128-bit vector of [2 x double] into two single-precision floating-point
1287 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1288 /// The upper 64 bits of the result vector are set to zero.
1289 ///
1290 /// \headerfile <x86intrin.h>
1291 ///
1292 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1293 ///
1294 /// \param __a
1295 /// A 128-bit vector of [2 x double].
1296 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1297 /// converted values. The upper 64 bits are set to zero.
1298 static __inline__ __m128 __DEFAULT_FN_ATTRS
1300 {
1301  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1302 }
1303 
1304 /// Converts the lower two single-precision floating-point elements of a
1305 /// 128-bit vector of [4 x float] into two double-precision floating-point
1306 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1307 /// elements of the input vector are unused.
1308 ///
1309 /// \headerfile <x86intrin.h>
1310 ///
1311 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1312 ///
1313 /// \param __a
1314 /// A 128-bit vector of [4 x float]. The lower two single-precision
1315 /// floating-point elements are converted to double-precision values. The
1316 /// upper two elements are unused.
1317 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1318 static __inline__ __m128d __DEFAULT_FN_ATTRS
1320 {
1321  return (__m128d) __builtin_convertvector(
1322  __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1323 }
1324 
1325 /// Converts the lower two integer elements of a 128-bit vector of
1326 /// [4 x i32] into two double-precision floating-point values, returned in a
1327 /// 128-bit vector of [2 x double].
1328 ///
1329 /// The upper two elements of the input vector are unused.
1330 ///
1331 /// \headerfile <x86intrin.h>
1332 ///
1333 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1334 ///
1335 /// \param __a
1336 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1337 /// converted to double-precision values.
1338 ///
1339 /// The upper two elements are unused.
1340 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1341 static __inline__ __m128d __DEFAULT_FN_ATTRS
1343 {
1344  return (__m128d) __builtin_convertvector(
1345  __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1346 }
1347 
1348 /// Converts the two double-precision floating-point elements of a
1349 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1350 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1351 /// 64 bits of the result vector are set to zero.
1352 ///
1353 /// \headerfile <x86intrin.h>
1354 ///
1355 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1356 ///
1357 /// \param __a
1358 /// A 128-bit vector of [2 x double].
1359 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1360 /// converted values. The upper 64 bits are set to zero.
1361 static __inline__ __m128i __DEFAULT_FN_ATTRS
1363 {
1364  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1365 }
1366 
1367 /// Converts the low-order element of a 128-bit vector of [2 x double]
1368 /// into a 32-bit signed integer value.
1369 ///
1370 /// \headerfile <x86intrin.h>
1371 ///
1372 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1373 ///
1374 /// \param __a
1375 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1376 /// conversion.
1377 /// \returns A 32-bit signed integer containing the converted value.
1378 static __inline__ int __DEFAULT_FN_ATTRS
1380 {
1381  return __builtin_ia32_cvtsd2si((__v2df)__a);
1382 }
1383 
1384 /// Converts the lower double-precision floating-point element of a
1385 /// 128-bit vector of [2 x double], in the second parameter, into a
1386 /// single-precision floating-point value, returned in the lower 32 bits of a
1387 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1388 /// copied from the upper 96 bits of the first parameter.
1389 ///
1390 /// \headerfile <x86intrin.h>
1391 ///
1392 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1393 ///
1394 /// \param __a
1395 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1396 /// copied to the upper 96 bits of the result.
1397 /// \param __b
1398 /// A 128-bit vector of [2 x double]. The lower double-precision
1399 /// floating-point element is used in the conversion.
1400 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1401 /// converted value from the second parameter. The upper 96 bits are copied
1402 /// from the upper 96 bits of the first parameter.
1403 static __inline__ __m128 __DEFAULT_FN_ATTRS
1404 _mm_cvtsd_ss(__m128 __a, __m128d __b)
1405 {
1406  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1407 }
1408 
1409 /// Converts a 32-bit signed integer value, in the second parameter, into
1410 /// a double-precision floating-point value, returned in the lower 64 bits of
1411 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1412 /// are copied from the upper 64 bits of the first parameter.
1413 ///
1414 /// \headerfile <x86intrin.h>
1415 ///
1416 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1417 ///
1418 /// \param __a
1419 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1420 /// copied to the upper 64 bits of the result.
1421 /// \param __b
1422 /// A 32-bit signed integer containing the value to be converted.
1423 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1424 /// converted value from the second parameter. The upper 64 bits are copied
1425 /// from the upper 64 bits of the first parameter.
1426 static __inline__ __m128d __DEFAULT_FN_ATTRS
1427 _mm_cvtsi32_sd(__m128d __a, int __b)
1428 {
1429  __a[0] = __b;
1430  return __a;
1431 }
1432 
1433 /// Converts the lower single-precision floating-point element of a
1434 /// 128-bit vector of [4 x float], in the second parameter, into a
1435 /// double-precision floating-point value, returned in the lower 64 bits of
1436 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1437 /// are copied from the upper 64 bits of the first parameter.
1438 ///
1439 /// \headerfile <x86intrin.h>
1440 ///
1441 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1442 ///
1443 /// \param __a
1444 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1445 /// copied to the upper 64 bits of the result.
1446 /// \param __b
1447 /// A 128-bit vector of [4 x float]. The lower single-precision
1448 /// floating-point element is used in the conversion.
1449 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1450 /// converted value from the second parameter. The upper 64 bits are copied
1451 /// from the upper 64 bits of the first parameter.
1452 static __inline__ __m128d __DEFAULT_FN_ATTRS
1453 _mm_cvtss_sd(__m128d __a, __m128 __b)
1454 {
1455  __a[0] = __b[0];
1456  return __a;
1457 }
1458 
1459 /// Converts the two double-precision floating-point elements of a
1460 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1461 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1462 ///
1463 /// If the result of either conversion is inexact, the result is truncated
1464 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1465 /// 64 bits of the result vector are set to zero.
1466 ///
1467 /// \headerfile <x86intrin.h>
1468 ///
1469 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1470 /// instruction.
1471 ///
1472 /// \param __a
1473 /// A 128-bit vector of [2 x double].
1474 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1475 /// converted values. The upper 64 bits are set to zero.
1476 static __inline__ __m128i __DEFAULT_FN_ATTRS
1478 {
1479  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1480 }
1481 
1482 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1483 /// signed integer value, truncating the result when it is inexact.
1484 ///
1485 /// \headerfile <x86intrin.h>
1486 ///
1487 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1488 /// instruction.
1489 ///
1490 /// \param __a
1491 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1492 /// conversion.
1493 /// \returns A 32-bit signed integer containing the converted value.
1494 static __inline__ int __DEFAULT_FN_ATTRS
1496 {
1497  return __builtin_ia32_cvttsd2si((__v2df)__a);
1498 }
1499 
1500 /// Converts the two double-precision floating-point elements of a
1501 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1502 /// returned in a 64-bit vector of [2 x i32].
1503 ///
1504 /// \headerfile <x86intrin.h>
1505 ///
1506 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1507 ///
1508 /// \param __a
1509 /// A 128-bit vector of [2 x double].
1510 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1511 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1513 {
1514  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1515 }
1516 
1517 /// Converts the two double-precision floating-point elements of a
1518 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1519 /// returned in a 64-bit vector of [2 x i32].
1520 ///
1521 /// If the result of either conversion is inexact, the result is truncated
1522 /// (rounded towards zero) regardless of the current MXCSR setting.
1523 ///
1524 /// \headerfile <x86intrin.h>
1525 ///
1526 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1527 ///
1528 /// \param __a
1529 /// A 128-bit vector of [2 x double].
1530 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1531 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1533 {
1534  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1535 }
1536 
1537 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1538 /// [2 x i32] into two double-precision floating-point values, returned in a
1539 /// 128-bit vector of [2 x double].
1540 ///
1541 /// \headerfile <x86intrin.h>
1542 ///
1543 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1544 ///
1545 /// \param __a
1546 /// A 64-bit vector of [2 x i32].
1547 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1548 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
1550 {
1551  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1552 }
1553 
1554 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1555 /// a double-precision floating-point value.
1556 ///
1557 /// \headerfile <x86intrin.h>
1558 ///
1559 /// This intrinsic has no corresponding instruction.
1560 ///
1561 /// \param __a
1562 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1563 /// \returns A double-precision floating-point value copied from the lower 64
1564 /// bits of \a __a.
1565 static __inline__ double __DEFAULT_FN_ATTRS
1567 {
1568  return __a[0];
1569 }
1570 
1571 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1572 /// memory location.
1573 ///
1574 /// \headerfile <x86intrin.h>
1575 ///
1576 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1577 ///
1578 /// \param __dp
1579 /// A pointer to a 128-bit memory location. The address of the memory
1580 /// location has to be 16-byte aligned.
1581 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1582 static __inline__ __m128d __DEFAULT_FN_ATTRS
1583 _mm_load_pd(double const *__dp)
1584 {
1585  return *(const __m128d*)__dp;
1586 }
1587 
1588 /// Loads a double-precision floating-point value from a specified memory
1589 /// location and duplicates it to both vector elements of a 128-bit vector of
1590 /// [2 x double].
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1595 ///
1596 /// \param __dp
1597 /// A pointer to a memory location containing a double-precision value.
1598 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1599 /// duplicated values.
1600 static __inline__ __m128d __DEFAULT_FN_ATTRS
1601 _mm_load1_pd(double const *__dp)
1602 {
1603  struct __mm_load1_pd_struct {
1604  double __u;
1605  } __attribute__((__packed__, __may_alias__));
1606  double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
1607  return __extension__ (__m128d){ __u, __u };
1608 }
1609 
1610 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1611 
1612 /// Loads two double-precision values, in reverse order, from an aligned
1613 /// memory location into a 128-bit vector of [2 x double].
1614 ///
1615 /// \headerfile <x86intrin.h>
1616 ///
1617 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1618 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1619 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1620 ///
1621 /// \param __dp
1622 /// A 16-byte aligned pointer to an array of double-precision values to be
1623 /// loaded in reverse order.
1624 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1625 /// values.
1626 static __inline__ __m128d __DEFAULT_FN_ATTRS
1627 _mm_loadr_pd(double const *__dp)
1628 {
1629  __m128d __u = *(const __m128d*)__dp;
1630  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1631 }
1632 
1633 /// Loads a 128-bit floating-point vector of [2 x double] from an
1634 /// unaligned memory location.
1635 ///
1636 /// \headerfile <x86intrin.h>
1637 ///
1638 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1639 ///
1640 /// \param __dp
1641 /// A pointer to a 128-bit memory location. The address of the memory
1642 /// location does not have to be aligned.
1643 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1644 static __inline__ __m128d __DEFAULT_FN_ATTRS
1645 _mm_loadu_pd(double const *__dp)
1646 {
1647  struct __loadu_pd {
1648  __m128d_u __v;
1649  } __attribute__((__packed__, __may_alias__));
1650  return ((const struct __loadu_pd*)__dp)->__v;
1651 }
1652 
1653 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1654 /// vector and clears the upper element.
1655 ///
1656 /// \headerfile <x86intrin.h>
1657 ///
1658 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1659 ///
1660 /// \param __a
1661 /// A pointer to a 64-bit memory location. The address of the memory
1662 /// location does not have to be aligned.
1663 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1664 static __inline__ __m128i __DEFAULT_FN_ATTRS
1665 _mm_loadu_si64(void const *__a)
1666 {
1667  struct __loadu_si64 {
1668  long long __v;
1669  } __attribute__((__packed__, __may_alias__));
1670  long long __u = ((const struct __loadu_si64*)__a)->__v;
1671  return __extension__ (__m128i)(__v2di){__u, 0LL};
1672 }
1673 
1674 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1675 /// vector and clears the upper element.
1676 ///
1677 /// \headerfile <x86intrin.h>
1678 ///
1679 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1680 ///
1681 /// \param __a
1682 /// A pointer to a 32-bit memory location. The address of the memory
1683 /// location does not have to be aligned.
1684 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1685 static __inline__ __m128i __DEFAULT_FN_ATTRS
1686 _mm_loadu_si32(void const *__a)
1687 {
1688  struct __loadu_si32 {
1689  int __v;
1690  } __attribute__((__packed__, __may_alias__));
1691  int __u = ((const struct __loadu_si32*)__a)->__v;
1692  return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
1693 }
1694 
1695 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1696 /// vector and clears the upper element.
1697 ///
1698 /// \headerfile <x86intrin.h>
1699 ///
1700 /// This intrinsic does not correspond to a specific instruction.
1701 ///
1702 /// \param __a
1703 /// A pointer to a 16-bit memory location. The address of the memory
1704 /// location does not have to be aligned.
1705 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1706 static __inline__ __m128i __DEFAULT_FN_ATTRS
1707 _mm_loadu_si16(void const *__a)
1708 {
1709  struct __loadu_si16 {
1710  short __v;
1711  } __attribute__((__packed__, __may_alias__));
1712  short __u = ((const struct __loadu_si16*)__a)->__v;
1713  return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1714 }
1715 
1716 /// Loads a 64-bit double-precision value to the low element of a
1717 /// 128-bit integer vector and clears the upper element.
1718 ///
1719 /// \headerfile <x86intrin.h>
1720 ///
1721 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1722 ///
1723 /// \param __dp
1724 /// A pointer to a memory location containing a double-precision value.
1725 /// The address of the memory location does not have to be aligned.
1726 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1727 static __inline__ __m128d __DEFAULT_FN_ATTRS
1728 _mm_load_sd(double const *__dp)
1729 {
1730  struct __mm_load_sd_struct {
1731  double __u;
1732  } __attribute__((__packed__, __may_alias__));
1733  double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
1734  return __extension__ (__m128d){ __u, 0 };
1735 }
1736 
1737 /// Loads a double-precision value into the high-order bits of a 128-bit
1738 /// vector of [2 x double]. The low-order bits are copied from the low-order
1739 /// bits of the first operand.
1740 ///
1741 /// \headerfile <x86intrin.h>
1742 ///
1743 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1744 ///
1745 /// \param __a
1746 /// A 128-bit vector of [2 x double]. \n
1747 /// Bits [63:0] are written to bits [63:0] of the result.
1748 /// \param __dp
1749 /// A pointer to a 64-bit memory location containing a double-precision
1750 /// floating-point value that is loaded. The loaded value is written to bits
1751 /// [127:64] of the result. The address of the memory location does not have
1752 /// to be aligned.
1753 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1754 static __inline__ __m128d __DEFAULT_FN_ATTRS
1755 _mm_loadh_pd(__m128d __a, double const *__dp)
1756 {
1757  struct __mm_loadh_pd_struct {
1758  double __u;
1759  } __attribute__((__packed__, __may_alias__));
1760  double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
1761  return __extension__ (__m128d){ __a[0], __u };
1762 }
1763 
1764 /// Loads a double-precision value into the low-order bits of a 128-bit
1765 /// vector of [2 x double]. The high-order bits are copied from the
1766 /// high-order bits of the first operand.
1767 ///
1768 /// \headerfile <x86intrin.h>
1769 ///
1770 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1771 ///
1772 /// \param __a
1773 /// A 128-bit vector of [2 x double]. \n
1774 /// Bits [127:64] are written to bits [127:64] of the result.
1775 /// \param __dp
1776 /// A pointer to a 64-bit memory location containing a double-precision
1777 /// floating-point value that is loaded. The loaded value is written to bits
1778 /// [63:0] of the result. The address of the memory location does not have to
1779 /// be aligned.
1780 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1781 static __inline__ __m128d __DEFAULT_FN_ATTRS
1782 _mm_loadl_pd(__m128d __a, double const *__dp)
1783 {
1784  struct __mm_loadl_pd_struct {
1785  double __u;
1786  } __attribute__((__packed__, __may_alias__));
1787  double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
1788  return __extension__ (__m128d){ __u, __a[1] };
1789 }
1790 
1791 /// Constructs a 128-bit floating-point vector of [2 x double] with
1792 /// unspecified content. This could be used as an argument to another
1793 /// intrinsic function where the argument is required but the value is not
1794 /// actually used.
1795 ///
1796 /// \headerfile <x86intrin.h>
1797 ///
1798 /// This intrinsic has no corresponding instruction.
1799 ///
1800 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1801 /// content.
1802 static __inline__ __m128d __DEFAULT_FN_ATTRS
1804 {
1805  return (__m128d)__builtin_ia32_undef128();
1806 }
1807 
1808 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1809 /// 64 bits of the vector are initialized with the specified double-precision
1810 /// floating-point value. The upper 64 bits are set to zero.
1811 ///
1812 /// \headerfile <x86intrin.h>
1813 ///
1814 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1815 ///
1816 /// \param __w
1817 /// A double-precision floating-point value used to initialize the lower 64
1818 /// bits of the result.
1819 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1820 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1821 /// set to zero.
1822 static __inline__ __m128d __DEFAULT_FN_ATTRS
1823 _mm_set_sd(double __w)
1824 {
1825  return __extension__ (__m128d){ __w, 0 };
1826 }
1827 
1828 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1829 /// of the two double-precision floating-point vector elements set to the
1830 /// specified double-precision floating-point value.
1831 ///
1832 /// \headerfile <x86intrin.h>
1833 ///
1834 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1835 ///
1836 /// \param __w
1837 /// A double-precision floating-point value used to initialize each vector
1838 /// element of the result.
1839 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1840 static __inline__ __m128d __DEFAULT_FN_ATTRS
1841 _mm_set1_pd(double __w)
1842 {
1843  return __extension__ (__m128d){ __w, __w };
1844 }
1845 
1846 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1847 /// of the two double-precision floating-point vector elements set to the
1848 /// specified double-precision floating-point value.
1849 ///
1850 /// \headerfile <x86intrin.h>
1851 ///
1852 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1853 ///
1854 /// \param __w
1855 /// A double-precision floating-point value used to initialize each vector
1856 /// element of the result.
1857 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1858 static __inline__ __m128d __DEFAULT_FN_ATTRS
1859 _mm_set_pd1(double __w)
1860 {
1861  return _mm_set1_pd(__w);
1862 }
1863 
1864 /// Constructs a 128-bit floating-point vector of [2 x double]
1865 /// initialized with the specified double-precision floating-point values.
1866 ///
1867 /// \headerfile <x86intrin.h>
1868 ///
1869 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1870 ///
1871 /// \param __w
1872 /// A double-precision floating-point value used to initialize the upper 64
1873 /// bits of the result.
1874 /// \param __x
1875 /// A double-precision floating-point value used to initialize the lower 64
1876 /// bits of the result.
1877 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1878 static __inline__ __m128d __DEFAULT_FN_ATTRS
1879 _mm_set_pd(double __w, double __x)
1880 {
1881  return __extension__ (__m128d){ __x, __w };
1882 }
1883 
1884 /// Constructs a 128-bit floating-point vector of [2 x double],
1885 /// initialized in reverse order with the specified double-precision
1886 /// floating-point values.
1887 ///
1888 /// \headerfile <x86intrin.h>
1889 ///
1890 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1891 ///
1892 /// \param __w
1893 /// A double-precision floating-point value used to initialize the lower 64
1894 /// bits of the result.
1895 /// \param __x
1896 /// A double-precision floating-point value used to initialize the upper 64
1897 /// bits of the result.
1898 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1899 static __inline__ __m128d __DEFAULT_FN_ATTRS
1900 _mm_setr_pd(double __w, double __x)
1901 {
1902  return __extension__ (__m128d){ __w, __x };
1903 }
1904 
1905 /// Constructs a 128-bit floating-point vector of [2 x double]
1906 /// initialized to zero.
1907 ///
1908 /// \headerfile <x86intrin.h>
1909 ///
1910 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1911 ///
1912 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1913 /// all elements set to zero.
1914 static __inline__ __m128d __DEFAULT_FN_ATTRS
1916 {
1917  return __extension__ (__m128d){ 0, 0 };
1918 }
1919 
1920 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1921 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1922 /// 64 bits are set to the upper 64 bits of the first parameter.
1923 ///
1924 /// \headerfile <x86intrin.h>
1925 ///
1926 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1927 ///
1928 /// \param __a
1929 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1930 /// upper 64 bits of the result.
1931 /// \param __b
1932 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1933 /// lower 64 bits of the result.
1934 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1935 static __inline__ __m128d __DEFAULT_FN_ATTRS
1936 _mm_move_sd(__m128d __a, __m128d __b)
1937 {
1938  __a[0] = __b[0];
1939  return __a;
1940 }
1941 
1942 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1943 /// memory location.
1944 ///
1945 /// \headerfile <x86intrin.h>
1946 ///
1947 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1948 ///
1949 /// \param __dp
1950 /// A pointer to a 64-bit memory location.
1951 /// \param __a
1952 /// A 128-bit vector of [2 x double] containing the value to be stored.
1953 static __inline__ void __DEFAULT_FN_ATTRS
1954 _mm_store_sd(double *__dp, __m128d __a)
1955 {
1956  struct __mm_store_sd_struct {
1957  double __u;
1958  } __attribute__((__packed__, __may_alias__));
1959  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1960 }
1961 
1962 /// Moves packed double-precision values from a 128-bit vector of
1963 /// [2 x double] to a memory location.
1964 ///
1965 /// \headerfile <x86intrin.h>
1966 ///
1967 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1968 ///
1969 /// \param __dp
1970 /// A pointer to an aligned memory location that can store two
1971 /// double-precision values.
1972 /// \param __a
1973 /// A packed 128-bit vector of [2 x double] containing the values to be
1974 /// moved.
1975 static __inline__ void __DEFAULT_FN_ATTRS
1976 _mm_store_pd(double *__dp, __m128d __a)
1977 {
1978  *(__m128d*)__dp = __a;
1979 }
1980 
1981 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1982 /// the upper and lower 64 bits of a memory location.
1983 ///
1984 /// \headerfile <x86intrin.h>
1985 ///
1986 /// This intrinsic corresponds to the
1987 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1988 ///
1989 /// \param __dp
1990 /// A pointer to a memory location that can store two double-precision
1991 /// values.
1992 /// \param __a
1993 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1994 /// of the values in \a __dp.
1995 static __inline__ void __DEFAULT_FN_ATTRS
1996 _mm_store1_pd(double *__dp, __m128d __a)
1997 {
1998  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1999  _mm_store_pd(__dp, __a);
2000 }
2001 
2002 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
2003 /// the upper and lower 64 bits of a memory location.
2004 ///
2005 /// \headerfile <x86intrin.h>
2006 ///
2007 /// This intrinsic corresponds to the
2008 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
2009 ///
2010 /// \param __dp
2011 /// A pointer to a memory location that can store two double-precision
2012 /// values.
2013 /// \param __a
2014 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
2015 /// of the values in \a __dp.
2016 static __inline__ void __DEFAULT_FN_ATTRS
2017 _mm_store_pd1(double *__dp, __m128d __a)
2018 {
2019  _mm_store1_pd(__dp, __a);
2020 }
2021 
2022 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
2023 /// location.
2024 ///
2025 /// \headerfile <x86intrin.h>
2026 ///
2027 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
2028 ///
2029 /// \param __dp
2030 /// A pointer to a 128-bit memory location. The address of the memory
2031 /// location does not have to be aligned.
2032 /// \param __a
2033 /// A 128-bit vector of [2 x double] containing the values to be stored.
2034 static __inline__ void __DEFAULT_FN_ATTRS
2035 _mm_storeu_pd(double *__dp, __m128d __a)
2036 {
2037  struct __storeu_pd {
2038  __m128d_u __v;
2039  } __attribute__((__packed__, __may_alias__));
2040  ((struct __storeu_pd*)__dp)->__v = __a;
2041 }
2042 
2043 /// Stores two double-precision values, in reverse order, from a 128-bit
2044 /// vector of [2 x double] to a 16-byte aligned memory location.
2045 ///
2046 /// \headerfile <x86intrin.h>
2047 ///
2048 /// This intrinsic corresponds to a shuffling instruction followed by a
2049 /// <c> VMOVAPD / MOVAPD </c> instruction.
2050 ///
2051 /// \param __dp
2052 /// A pointer to a 16-byte aligned memory location that can store two
2053 /// double-precision values.
2054 /// \param __a
2055 /// A 128-bit vector of [2 x double] containing the values to be reversed and
2056 /// stored.
2057 static __inline__ void __DEFAULT_FN_ATTRS
2058 _mm_storer_pd(double *__dp, __m128d __a)
2059 {
2060  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2061  *(__m128d *)__dp = __a;
2062 }
2063 
2064 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2065 /// memory location.
2066 ///
2067 /// \headerfile <x86intrin.h>
2068 ///
2069 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2070 ///
2071 /// \param __dp
2072 /// A pointer to a 64-bit memory location.
2073 /// \param __a
2074 /// A 128-bit vector of [2 x double] containing the value to be stored.
2075 static __inline__ void __DEFAULT_FN_ATTRS
2076 _mm_storeh_pd(double *__dp, __m128d __a)
2077 {
2078  struct __mm_storeh_pd_struct {
2079  double __u;
2080  } __attribute__((__packed__, __may_alias__));
2081  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
2082 }
2083 
2084 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2085 /// memory location.
2086 ///
2087 /// \headerfile <x86intrin.h>
2088 ///
2089 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2090 ///
2091 /// \param __dp
2092 /// A pointer to a 64-bit memory location.
2093 /// \param __a
2094 /// A 128-bit vector of [2 x double] containing the value to be stored.
2095 static __inline__ void __DEFAULT_FN_ATTRS
2096 _mm_storel_pd(double *__dp, __m128d __a)
2097 {
2098  struct __mm_storeh_pd_struct {
2099  double __u;
2100  } __attribute__((__packed__, __may_alias__));
2101  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
2102 }
2103 
2104 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2105 /// saving the lower 8 bits of each sum in the corresponding element of a
2106 /// 128-bit result vector of [16 x i8].
2107 ///
2108 /// The integer elements of both parameters can be either signed or unsigned.
2109 ///
2110 /// \headerfile <x86intrin.h>
2111 ///
2112 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2113 ///
2114 /// \param __a
2115 /// A 128-bit vector of [16 x i8].
2116 /// \param __b
2117 /// A 128-bit vector of [16 x i8].
2118 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2119 /// parameters.
2120 static __inline__ __m128i __DEFAULT_FN_ATTRS
2121 _mm_add_epi8(__m128i __a, __m128i __b)
2122 {
2123  return (__m128i)((__v16qu)__a + (__v16qu)__b);
2124 }
2125 
2126 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2127 /// saving the lower 16 bits of each sum in the corresponding element of a
2128 /// 128-bit result vector of [8 x i16].
2129 ///
2130 /// The integer elements of both parameters can be either signed or unsigned.
2131 ///
2132 /// \headerfile <x86intrin.h>
2133 ///
2134 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2135 ///
2136 /// \param __a
2137 /// A 128-bit vector of [8 x i16].
2138 /// \param __b
2139 /// A 128-bit vector of [8 x i16].
2140 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2141 /// parameters.
2142 static __inline__ __m128i __DEFAULT_FN_ATTRS
2143 _mm_add_epi16(__m128i __a, __m128i __b)
2144 {
2145  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2146 }
2147 
2148 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2149 /// saving the lower 32 bits of each sum in the corresponding element of a
2150 /// 128-bit result vector of [4 x i32].
2151 ///
2152 /// The integer elements of both parameters can be either signed or unsigned.
2153 ///
2154 /// \headerfile <x86intrin.h>
2155 ///
2156 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2157 ///
2158 /// \param __a
2159 /// A 128-bit vector of [4 x i32].
2160 /// \param __b
2161 /// A 128-bit vector of [4 x i32].
2162 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2163 /// parameters.
2164 static __inline__ __m128i __DEFAULT_FN_ATTRS
2165 _mm_add_epi32(__m128i __a, __m128i __b)
2166 {
2167  return (__m128i)((__v4su)__a + (__v4su)__b);
2168 }
2169 
2170 /// Adds two signed or unsigned 64-bit integer values, returning the
2171 /// lower 64 bits of the sum.
2172 ///
2173 /// \headerfile <x86intrin.h>
2174 ///
2175 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2176 ///
2177 /// \param __a
2178 /// A 64-bit integer.
2179 /// \param __b
2180 /// A 64-bit integer.
2181 /// \returns A 64-bit integer containing the sum of both parameters.
2182 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2183 _mm_add_si64(__m64 __a, __m64 __b)
2184 {
2185  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2186 }
2187 
2188 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2189 /// saving the lower 64 bits of each sum in the corresponding element of a
2190 /// 128-bit result vector of [2 x i64].
2191 ///
2192 /// The integer elements of both parameters can be either signed or unsigned.
2193 ///
2194 /// \headerfile <x86intrin.h>
2195 ///
2196 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2197 ///
2198 /// \param __a
2199 /// A 128-bit vector of [2 x i64].
2200 /// \param __b
2201 /// A 128-bit vector of [2 x i64].
2202 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2203 /// parameters.
2204 static __inline__ __m128i __DEFAULT_FN_ATTRS
2205 _mm_add_epi64(__m128i __a, __m128i __b)
2206 {
2207  return (__m128i)((__v2du)__a + (__v2du)__b);
2208 }
2209 
2210 /// Adds, with saturation, the corresponding elements of two 128-bit
2211 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2212 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2213 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2214 ///
2215 /// \headerfile <x86intrin.h>
2216 ///
2217 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2218 ///
2219 /// \param __a
2220 /// A 128-bit signed [16 x i8] vector.
2221 /// \param __b
2222 /// A 128-bit signed [16 x i8] vector.
2223 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2224 /// both parameters.
2225 static __inline__ __m128i __DEFAULT_FN_ATTRS
2226 _mm_adds_epi8(__m128i __a, __m128i __b)
2227 {
2228  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2229 }
2230 
2231 /// Adds, with saturation, the corresponding elements of two 128-bit
2232 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2233 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2234 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2235 /// 0x8000.
2236 ///
2237 /// \headerfile <x86intrin.h>
2238 ///
2239 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2240 ///
2241 /// \param __a
2242 /// A 128-bit signed [8 x i16] vector.
2243 /// \param __b
2244 /// A 128-bit signed [8 x i16] vector.
2245 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2246 /// both parameters.
2247 static __inline__ __m128i __DEFAULT_FN_ATTRS
2248 _mm_adds_epi16(__m128i __a, __m128i __b)
2249 {
2250  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2251 }
2252 
2253 /// Adds, with saturation, the corresponding elements of two 128-bit
2254 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2255 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2256 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2257 ///
2258 /// \headerfile <x86intrin.h>
2259 ///
2260 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2261 ///
2262 /// \param __a
2263 /// A 128-bit unsigned [16 x i8] vector.
2264 /// \param __b
2265 /// A 128-bit unsigned [16 x i8] vector.
2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2267 /// of both parameters.
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS
2269 _mm_adds_epu8(__m128i __a, __m128i __b)
2270 {
2271  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2272 }
2273 
2274 /// Adds, with saturation, the corresponding elements of two 128-bit
2275 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2276 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2277 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2278 ///
2279 /// \headerfile <x86intrin.h>
2280 ///
2281 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2282 ///
2283 /// \param __a
2284 /// A 128-bit unsigned [8 x i16] vector.
2285 /// \param __b
2286 /// A 128-bit unsigned [8 x i16] vector.
2287 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2288 /// of both parameters.
2289 static __inline__ __m128i __DEFAULT_FN_ATTRS
2290 _mm_adds_epu16(__m128i __a, __m128i __b)
2291 {
2292  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2293 }
2294 
2295 /// Computes the rounded averages of corresponding elements of two
2296 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2297 /// corresponding element of a 128-bit result vector of [16 x i8].
2298 ///
2299 /// \headerfile <x86intrin.h>
2300 ///
2301 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2302 ///
2303 /// \param __a
2304 /// A 128-bit unsigned [16 x i8] vector.
2305 /// \param __b
2306 /// A 128-bit unsigned [16 x i8] vector.
2307 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2308 /// averages of both parameters.
2309 static __inline__ __m128i __DEFAULT_FN_ATTRS
2310 _mm_avg_epu8(__m128i __a, __m128i __b)
2311 {
2312  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2313 }
2314 
2315 /// Computes the rounded averages of corresponding elements of two
2316 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2317 /// corresponding element of a 128-bit result vector of [8 x i16].
2318 ///
2319 /// \headerfile <x86intrin.h>
2320 ///
2321 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2322 ///
2323 /// \param __a
2324 /// A 128-bit unsigned [8 x i16] vector.
2325 /// \param __b
2326 /// A 128-bit unsigned [8 x i16] vector.
2327 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2328 /// averages of both parameters.
2329 static __inline__ __m128i __DEFAULT_FN_ATTRS
2330 _mm_avg_epu16(__m128i __a, __m128i __b)
2331 {
2332  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2333 }
2334 
2335 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2336 /// vectors, producing eight intermediate 32-bit signed integer products, and
2337 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2338 /// [4 x i32] vector.
2339 ///
2340 /// For example, bits [15:0] of both parameters are multiplied producing a
2341 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2342 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2343 /// of the result.
2344 ///
2345 /// \headerfile <x86intrin.h>
2346 ///
2347 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2348 ///
2349 /// \param __a
2350 /// A 128-bit signed [8 x i16] vector.
2351 /// \param __b
2352 /// A 128-bit signed [8 x i16] vector.
2353 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2354 /// of both parameters.
2355 static __inline__ __m128i __DEFAULT_FN_ATTRS
2356 _mm_madd_epi16(__m128i __a, __m128i __b)
2357 {
2358  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2359 }
2360 
2361 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2362 /// vectors, saving the greater value from each comparison in the
2363 /// corresponding element of a 128-bit result vector of [8 x i16].
2364 ///
2365 /// \headerfile <x86intrin.h>
2366 ///
2367 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2368 ///
2369 /// \param __a
2370 /// A 128-bit signed [8 x i16] vector.
2371 /// \param __b
2372 /// A 128-bit signed [8 x i16] vector.
2373 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2374 /// each comparison.
2375 static __inline__ __m128i __DEFAULT_FN_ATTRS
2376 _mm_max_epi16(__m128i __a, __m128i __b)
2377 {
2378  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2379 }
2380 
2381 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2382 /// vectors, saving the greater value from each comparison in the
2383 /// corresponding element of a 128-bit result vector of [16 x i8].
2384 ///
2385 /// \headerfile <x86intrin.h>
2386 ///
2387 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2388 ///
2389 /// \param __a
2390 /// A 128-bit unsigned [16 x i8] vector.
2391 /// \param __b
2392 /// A 128-bit unsigned [16 x i8] vector.
2393 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2394 /// each comparison.
2395 static __inline__ __m128i __DEFAULT_FN_ATTRS
2396 _mm_max_epu8(__m128i __a, __m128i __b)
2397 {
2398  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2399 }
2400 
2401 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2402 /// vectors, saving the smaller value from each comparison in the
2403 /// corresponding element of a 128-bit result vector of [8 x i16].
2404 ///
2405 /// \headerfile <x86intrin.h>
2406 ///
2407 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2408 ///
2409 /// \param __a
2410 /// A 128-bit signed [8 x i16] vector.
2411 /// \param __b
2412 /// A 128-bit signed [8 x i16] vector.
2413 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2414 /// each comparison.
2415 static __inline__ __m128i __DEFAULT_FN_ATTRS
2416 _mm_min_epi16(__m128i __a, __m128i __b)
2417 {
2418  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2419 }
2420 
2421 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2422 /// vectors, saving the smaller value from each comparison in the
2423 /// corresponding element of a 128-bit result vector of [16 x i8].
2424 ///
2425 /// \headerfile <x86intrin.h>
2426 ///
2427 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2428 ///
2429 /// \param __a
2430 /// A 128-bit unsigned [16 x i8] vector.
2431 /// \param __b
2432 /// A 128-bit unsigned [16 x i8] vector.
2433 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2434 /// each comparison.
2435 static __inline__ __m128i __DEFAULT_FN_ATTRS
2436 _mm_min_epu8(__m128i __a, __m128i __b)
2437 {
2438  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2439 }
2440 
2441 /// Multiplies the corresponding elements of two signed [8 x i16]
2442 /// vectors, saving the upper 16 bits of each 32-bit product in the
2443 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2444 ///
2445 /// \headerfile <x86intrin.h>
2446 ///
2447 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2448 ///
2449 /// \param __a
2450 /// A 128-bit signed [8 x i16] vector.
2451 /// \param __b
2452 /// A 128-bit signed [8 x i16] vector.
2453 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2454 /// each of the eight 32-bit products.
2455 static __inline__ __m128i __DEFAULT_FN_ATTRS
2456 _mm_mulhi_epi16(__m128i __a, __m128i __b)
2457 {
2458  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2459 }
2460 
2461 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2462 /// vectors, saving the upper 16 bits of each 32-bit product in the
2463 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2464 ///
2465 /// \headerfile <x86intrin.h>
2466 ///
2467 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2468 ///
2469 /// \param __a
2470 /// A 128-bit unsigned [8 x i16] vector.
2471 /// \param __b
2472 /// A 128-bit unsigned [8 x i16] vector.
2473 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2474 /// of each of the eight 32-bit products.
2475 static __inline__ __m128i __DEFAULT_FN_ATTRS
2476 _mm_mulhi_epu16(__m128i __a, __m128i __b)
2477 {
2478  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2479 }
2480 
2481 /// Multiplies the corresponding elements of two signed [8 x i16]
2482 /// vectors, saving the lower 16 bits of each 32-bit product in the
2483 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2484 ///
2485 /// \headerfile <x86intrin.h>
2486 ///
2487 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2488 ///
2489 /// \param __a
2490 /// A 128-bit signed [8 x i16] vector.
2491 /// \param __b
2492 /// A 128-bit signed [8 x i16] vector.
2493 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2494 /// each of the eight 32-bit products.
2495 static __inline__ __m128i __DEFAULT_FN_ATTRS
2496 _mm_mullo_epi16(__m128i __a, __m128i __b)
2497 {
2498  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2499 }
2500 
2501 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2502 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2503 /// product.
2504 ///
2505 /// \headerfile <x86intrin.h>
2506 ///
2507 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2508 ///
2509 /// \param __a
2510 /// A 64-bit integer containing one of the source operands.
2511 /// \param __b
2512 /// A 64-bit integer containing one of the source operands.
2513 /// \returns A 64-bit integer vector containing the product of both operands.
2514 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2515 _mm_mul_su32(__m64 __a, __m64 __b)
2516 {
2517  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2518 }
2519 
2520 /// Multiplies 32-bit unsigned integer values contained in the lower
2521 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2522 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2523 ///
2524 /// \headerfile <x86intrin.h>
2525 ///
2526 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2527 ///
2528 /// \param __a
2529 /// A [2 x i64] vector containing one of the source operands.
2530 /// \param __b
2531 /// A [2 x i64] vector containing one of the source operands.
2532 /// \returns A [2 x i64] vector containing the product of both operands.
2533 static __inline__ __m128i __DEFAULT_FN_ATTRS
2534 _mm_mul_epu32(__m128i __a, __m128i __b)
2535 {
2536  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2537 }
2538 
2539 /// Computes the absolute differences of corresponding 8-bit integer
2540 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2541 /// separately sums the second 8 absolute differences. Packs these two
2542 /// unsigned 16-bit integer sums into the upper and lower elements of a
2543 /// [2 x i64] vector.
2544 ///
2545 /// \headerfile <x86intrin.h>
2546 ///
2547 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2548 ///
2549 /// \param __a
2550 /// A 128-bit integer vector containing one of the source operands.
2551 /// \param __b
2552 /// A 128-bit integer vector containing one of the source operands.
2553 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2554 /// differences between both operands.
2555 static __inline__ __m128i __DEFAULT_FN_ATTRS
2556 _mm_sad_epu8(__m128i __a, __m128i __b)
2557 {
2558  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2559 }
2560 
2561 /// Subtracts the corresponding 8-bit integer values in the operands.
2562 ///
2563 /// \headerfile <x86intrin.h>
2564 ///
2565 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2566 ///
2567 /// \param __a
2568 /// A 128-bit integer vector containing the minuends.
2569 /// \param __b
2570 /// A 128-bit integer vector containing the subtrahends.
2571 /// \returns A 128-bit integer vector containing the differences of the values
2572 /// in the operands.
2573 static __inline__ __m128i __DEFAULT_FN_ATTRS
2574 _mm_sub_epi8(__m128i __a, __m128i __b)
2575 {
2576  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2577 }
2578 
2579 /// Subtracts the corresponding 16-bit integer values in the operands.
2580 ///
2581 /// \headerfile <x86intrin.h>
2582 ///
2583 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2584 ///
2585 /// \param __a
2586 /// A 128-bit integer vector containing the minuends.
2587 /// \param __b
2588 /// A 128-bit integer vector containing the subtrahends.
2589 /// \returns A 128-bit integer vector containing the differences of the values
2590 /// in the operands.
2591 static __inline__ __m128i __DEFAULT_FN_ATTRS
2592 _mm_sub_epi16(__m128i __a, __m128i __b)
2593 {
2594  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2595 }
2596 
2597 /// Subtracts the corresponding 32-bit integer values in the operands.
2598 ///
2599 /// \headerfile <x86intrin.h>
2600 ///
2601 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2602 ///
2603 /// \param __a
2604 /// A 128-bit integer vector containing the minuends.
2605 /// \param __b
2606 /// A 128-bit integer vector containing the subtrahends.
2607 /// \returns A 128-bit integer vector containing the differences of the values
2608 /// in the operands.
2609 static __inline__ __m128i __DEFAULT_FN_ATTRS
2610 _mm_sub_epi32(__m128i __a, __m128i __b)
2611 {
2612  return (__m128i)((__v4su)__a - (__v4su)__b);
2613 }
2614 
2615 /// Subtracts signed or unsigned 64-bit integer values and writes the
2616 /// difference to the corresponding bits in the destination.
2617 ///
2618 /// \headerfile <x86intrin.h>
2619 ///
2620 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2621 ///
2622 /// \param __a
2623 /// A 64-bit integer vector containing the minuend.
2624 /// \param __b
2625 /// A 64-bit integer vector containing the subtrahend.
2626 /// \returns A 64-bit integer vector containing the difference of the values in
2627 /// the operands.
2628 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2629 _mm_sub_si64(__m64 __a, __m64 __b)
2630 {
2631  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2632 }
2633 
2634 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2635 ///
2636 /// \headerfile <x86intrin.h>
2637 ///
2638 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2639 ///
2640 /// \param __a
2641 /// A 128-bit integer vector containing the minuends.
2642 /// \param __b
2643 /// A 128-bit integer vector containing the subtrahends.
2644 /// \returns A 128-bit integer vector containing the differences of the values
2645 /// in the operands.
2646 static __inline__ __m128i __DEFAULT_FN_ATTRS
2647 _mm_sub_epi64(__m128i __a, __m128i __b)
2648 {
2649  return (__m128i)((__v2du)__a - (__v2du)__b);
2650 }
2651 
2652 /// Subtracts corresponding 8-bit signed integer values in the input and
2653 /// returns the differences in the corresponding bytes in the destination.
2654 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2655 /// than 0x80 are saturated to 0x80.
2656 ///
2657 /// \headerfile <x86intrin.h>
2658 ///
2659 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2660 ///
2661 /// \param __a
2662 /// A 128-bit integer vector containing the minuends.
2663 /// \param __b
2664 /// A 128-bit integer vector containing the subtrahends.
2665 /// \returns A 128-bit integer vector containing the differences of the values
2666 /// in the operands.
2667 static __inline__ __m128i __DEFAULT_FN_ATTRS
2668 _mm_subs_epi8(__m128i __a, __m128i __b)
2669 {
2670  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2671 }
2672 
2673 /// Subtracts corresponding 16-bit signed integer values in the input and
2674 /// returns the differences in the corresponding bytes in the destination.
2675 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2676 /// than 0x8000 are saturated to 0x8000.
2677 ///
2678 /// \headerfile <x86intrin.h>
2679 ///
2680 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2681 ///
2682 /// \param __a
2683 /// A 128-bit integer vector containing the minuends.
2684 /// \param __b
2685 /// A 128-bit integer vector containing the subtrahends.
2686 /// \returns A 128-bit integer vector containing the differences of the values
2687 /// in the operands.
2688 static __inline__ __m128i __DEFAULT_FN_ATTRS
2689 _mm_subs_epi16(__m128i __a, __m128i __b)
2690 {
2691  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2692 }
2693 
2694 /// Subtracts corresponding 8-bit unsigned integer values in the input
2695 /// and returns the differences in the corresponding bytes in the
2696 /// destination. Differences less than 0x00 are saturated to 0x00.
2697 ///
2698 /// \headerfile <x86intrin.h>
2699 ///
2700 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2701 ///
2702 /// \param __a
2703 /// A 128-bit integer vector containing the minuends.
2704 /// \param __b
2705 /// A 128-bit integer vector containing the subtrahends.
2706 /// \returns A 128-bit integer vector containing the unsigned integer
2707 /// differences of the values in the operands.
2708 static __inline__ __m128i __DEFAULT_FN_ATTRS
2709 _mm_subs_epu8(__m128i __a, __m128i __b)
2710 {
2711  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2712 }
2713 
2714 /// Subtracts corresponding 16-bit unsigned integer values in the input
2715 /// and returns the differences in the corresponding bytes in the
2716 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2717 ///
2718 /// \headerfile <x86intrin.h>
2719 ///
2720 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2721 ///
2722 /// \param __a
2723 /// A 128-bit integer vector containing the minuends.
2724 /// \param __b
2725 /// A 128-bit integer vector containing the subtrahends.
2726 /// \returns A 128-bit integer vector containing the unsigned integer
2727 /// differences of the values in the operands.
2728 static __inline__ __m128i __DEFAULT_FN_ATTRS
2729 _mm_subs_epu16(__m128i __a, __m128i __b)
2730 {
2731  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2732 }
2733 
2734 /// Performs a bitwise AND of two 128-bit integer vectors.
2735 ///
2736 /// \headerfile <x86intrin.h>
2737 ///
2738 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2739 ///
2740 /// \param __a
2741 /// A 128-bit integer vector containing one of the source operands.
2742 /// \param __b
2743 /// A 128-bit integer vector containing one of the source operands.
2744 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2745 /// in both operands.
2746 static __inline__ __m128i __DEFAULT_FN_ATTRS
2747 _mm_and_si128(__m128i __a, __m128i __b)
2748 {
2749  return (__m128i)((__v2du)__a & (__v2du)__b);
2750 }
2751 
2752 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2753 /// one's complement of the values contained in the first source operand.
2754 ///
2755 /// \headerfile <x86intrin.h>
2756 ///
2757 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2758 ///
2759 /// \param __a
2760 /// A 128-bit vector containing the left source operand. The one's complement
2761 /// of this value is used in the bitwise AND.
2762 /// \param __b
2763 /// A 128-bit vector containing the right source operand.
2764 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2765 /// complement of the first operand and the values in the second operand.
2766 static __inline__ __m128i __DEFAULT_FN_ATTRS
2767 _mm_andnot_si128(__m128i __a, __m128i __b)
2768 {
2769  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2770 }
2771 /// Performs a bitwise OR of two 128-bit integer vectors.
2772 ///
2773 /// \headerfile <x86intrin.h>
2774 ///
2775 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2776 ///
2777 /// \param __a
2778 /// A 128-bit integer vector containing one of the source operands.
2779 /// \param __b
2780 /// A 128-bit integer vector containing one of the source operands.
2781 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2782 /// in both operands.
2783 static __inline__ __m128i __DEFAULT_FN_ATTRS
2784 _mm_or_si128(__m128i __a, __m128i __b)
2785 {
2786  return (__m128i)((__v2du)__a | (__v2du)__b);
2787 }
2788 
2789 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2790 ///
2791 /// \headerfile <x86intrin.h>
2792 ///
2793 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2794 ///
2795 /// \param __a
2796 /// A 128-bit integer vector containing one of the source operands.
2797 /// \param __b
2798 /// A 128-bit integer vector containing one of the source operands.
2799 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2800 /// values in both operands.
2801 static __inline__ __m128i __DEFAULT_FN_ATTRS
2802 _mm_xor_si128(__m128i __a, __m128i __b)
2803 {
2804  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2805 }
2806 
/// Shifts the whole 128-bit integer vector operand to the left by the
///    specified number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes by which operand
///    \a a is shifted to the left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2826 
/// Shifts the whole 128-bit integer vector operand \a a to the left by
///    \a imm bytes. This macro expands to the same builtin call, with the
///    same arguments, as _mm_slli_si128.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2829 
2830 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2831 /// by the specified number of bits. Low-order bits are cleared.
2832 ///
2833 /// \headerfile <x86intrin.h>
2834 ///
2835 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2836 ///
2837 /// \param __a
2838 /// A 128-bit integer vector containing the source operand.
2839 /// \param __count
2840 /// An integer value specifying the number of bits to left-shift each value
2841 /// in operand \a __a.
2842 /// \returns A 128-bit integer vector containing the left-shifted values.
2843 static __inline__ __m128i __DEFAULT_FN_ATTRS
2844 _mm_slli_epi16(__m128i __a, int __count)
2845 {
2846  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2847 }
2848 
2849 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2850 /// by the specified number of bits. Low-order bits are cleared.
2851 ///
2852 /// \headerfile <x86intrin.h>
2853 ///
2854 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2855 ///
2856 /// \param __a
2857 /// A 128-bit integer vector containing the source operand.
2858 /// \param __count
2859 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2860 /// to left-shift each value in operand \a __a.
2861 /// \returns A 128-bit integer vector containing the left-shifted values.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS
2863 _mm_sll_epi16(__m128i __a, __m128i __count)
2864 {
2865  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2866 }
2867 
2868 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2869 /// by the specified number of bits. Low-order bits are cleared.
2870 ///
2871 /// \headerfile <x86intrin.h>
2872 ///
2873 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2874 ///
2875 /// \param __a
2876 /// A 128-bit integer vector containing the source operand.
2877 /// \param __count
2878 /// An integer value specifying the number of bits to left-shift each value
2879 /// in operand \a __a.
2880 /// \returns A 128-bit integer vector containing the left-shifted values.
2881 static __inline__ __m128i __DEFAULT_FN_ATTRS
2882 _mm_slli_epi32(__m128i __a, int __count)
2883 {
2884  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2885 }
2886 
2887 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2888 /// by the specified number of bits. Low-order bits are cleared.
2889 ///
2890 /// \headerfile <x86intrin.h>
2891 ///
2892 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2893 ///
2894 /// \param __a
2895 /// A 128-bit integer vector containing the source operand.
2896 /// \param __count
2897 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2898 /// to left-shift each value in operand \a __a.
2899 /// \returns A 128-bit integer vector containing the left-shifted values.
2900 static __inline__ __m128i __DEFAULT_FN_ATTRS
2901 _mm_sll_epi32(__m128i __a, __m128i __count)
2902 {
2903  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2904 }
2905 
2906 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2907 /// by the specified number of bits. Low-order bits are cleared.
2908 ///
2909 /// \headerfile <x86intrin.h>
2910 ///
2911 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2912 ///
2913 /// \param __a
2914 /// A 128-bit integer vector containing the source operand.
2915 /// \param __count
2916 /// An integer value specifying the number of bits to left-shift each value
2917 /// in operand \a __a.
2918 /// \returns A 128-bit integer vector containing the left-shifted values.
2919 static __inline__ __m128i __DEFAULT_FN_ATTRS
2920 _mm_slli_epi64(__m128i __a, int __count)
2921 {
2922  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2923 }
2924 
2925 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2926 /// by the specified number of bits. Low-order bits are cleared.
2927 ///
2928 /// \headerfile <x86intrin.h>
2929 ///
2930 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2931 ///
2932 /// \param __a
2933 /// A 128-bit integer vector containing the source operand.
2934 /// \param __count
2935 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2936 /// to left-shift each value in operand \a __a.
2937 /// \returns A 128-bit integer vector containing the left-shifted values.
2938 static __inline__ __m128i __DEFAULT_FN_ATTRS
2939 _mm_sll_epi64(__m128i __a, __m128i __count)
2940 {
2941  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2942 }
2943 
2944 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2945 /// by the specified number of bits. High-order bits are filled with the sign
2946 /// bit of the initial value.
2947 ///
2948 /// \headerfile <x86intrin.h>
2949 ///
2950 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2951 ///
2952 /// \param __a
2953 /// A 128-bit integer vector containing the source operand.
2954 /// \param __count
2955 /// An integer value specifying the number of bits to right-shift each value
2956 /// in operand \a __a.
2957 /// \returns A 128-bit integer vector containing the right-shifted values.
2958 static __inline__ __m128i __DEFAULT_FN_ATTRS
2959 _mm_srai_epi16(__m128i __a, int __count)
2960 {
2961  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2962 }
2963 
2964 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2965 /// by the specified number of bits. High-order bits are filled with the sign
2966 /// bit of the initial value.
2967 ///
2968 /// \headerfile <x86intrin.h>
2969 ///
2970 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2971 ///
2972 /// \param __a
2973 /// A 128-bit integer vector containing the source operand.
2974 /// \param __count
2975 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2976 /// to right-shift each value in operand \a __a.
2977 /// \returns A 128-bit integer vector containing the right-shifted values.
2978 static __inline__ __m128i __DEFAULT_FN_ATTRS
2979 _mm_sra_epi16(__m128i __a, __m128i __count)
2980 {
2981  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2982 }
2983 
2984 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2985 /// by the specified number of bits. High-order bits are filled with the sign
2986 /// bit of the initial value.
2987 ///
2988 /// \headerfile <x86intrin.h>
2989 ///
2990 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2991 ///
2992 /// \param __a
2993 /// A 128-bit integer vector containing the source operand.
2994 /// \param __count
2995 /// An integer value specifying the number of bits to right-shift each value
2996 /// in operand \a __a.
2997 /// \returns A 128-bit integer vector containing the right-shifted values.
2998 static __inline__ __m128i __DEFAULT_FN_ATTRS
2999 _mm_srai_epi32(__m128i __a, int __count)
3000 {
3001  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
3002 }
3003 
3004 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
3005 /// by the specified number of bits. High-order bits are filled with the sign
3006 /// bit of the initial value.
3007 ///
3008 /// \headerfile <x86intrin.h>
3009 ///
3010 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
3011 ///
3012 /// \param __a
3013 /// A 128-bit integer vector containing the source operand.
3014 /// \param __count
3015 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3016 /// to right-shift each value in operand \a __a.
3017 /// \returns A 128-bit integer vector containing the right-shifted values.
3018 static __inline__ __m128i __DEFAULT_FN_ATTRS
3019 _mm_sra_epi32(__m128i __a, __m128i __count)
3020 {
3021  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
3022 }
3023 
/// Shifts the whole 128-bit integer vector operand to the right by the
///    specified number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes by which operand
///    \a a is shifted to the right.
/// \returns A 128-bit integer vector holding the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
3043 
/// Shifts the whole 128-bit integer vector operand \a a to the right by
///    \a imm bytes. This macro expands to the same builtin call, with the
///    same arguments, as _mm_srli_si128.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
3046 
3047 /// Right-shifts each of 16-bit values in the 128-bit integer vector
3048 /// operand by the specified number of bits. High-order bits are cleared.
3049 ///
3050 /// \headerfile <x86intrin.h>
3051 ///
3052 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3053 ///
3054 /// \param __a
3055 /// A 128-bit integer vector containing the source operand.
3056 /// \param __count
3057 /// An integer value specifying the number of bits to right-shift each value
3058 /// in operand \a __a.
3059 /// \returns A 128-bit integer vector containing the right-shifted values.
3060 static __inline__ __m128i __DEFAULT_FN_ATTRS
3061 _mm_srli_epi16(__m128i __a, int __count)
3062 {
3063  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
3064 }
3065 
3066 /// Right-shifts each of 16-bit values in the 128-bit integer vector
3067 /// operand by the specified number of bits. High-order bits are cleared.
3068 ///
3069 /// \headerfile <x86intrin.h>
3070 ///
3071 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3072 ///
3073 /// \param __a
3074 /// A 128-bit integer vector containing the source operand.
3075 /// \param __count
3076 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3077 /// to right-shift each value in operand \a __a.
3078 /// \returns A 128-bit integer vector containing the right-shifted values.
3079 static __inline__ __m128i __DEFAULT_FN_ATTRS
3080 _mm_srl_epi16(__m128i __a, __m128i __count)
3081 {
3082  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3083 }
3084 
3085 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3086 /// operand by the specified number of bits. High-order bits are cleared.
3087 ///
3088 /// \headerfile <x86intrin.h>
3089 ///
3090 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3091 ///
3092 /// \param __a
3093 /// A 128-bit integer vector containing the source operand.
3094 /// \param __count
3095 /// An integer value specifying the number of bits to right-shift each value
3096 /// in operand \a __a.
3097 /// \returns A 128-bit integer vector containing the right-shifted values.
3098 static __inline__ __m128i __DEFAULT_FN_ATTRS
3099 _mm_srli_epi32(__m128i __a, int __count)
3100 {
3101  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3102 }
3103 
3104 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3105 /// operand by the specified number of bits. High-order bits are cleared.
3106 ///
3107 /// \headerfile <x86intrin.h>
3108 ///
3109 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3110 ///
3111 /// \param __a
3112 /// A 128-bit integer vector containing the source operand.
3113 /// \param __count
3114 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3115 /// to right-shift each value in operand \a __a.
3116 /// \returns A 128-bit integer vector containing the right-shifted values.
3117 static __inline__ __m128i __DEFAULT_FN_ATTRS
3118 _mm_srl_epi32(__m128i __a, __m128i __count)
3119 {
3120  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3121 }
3122 
3123 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
3124 /// operand by the specified number of bits. High-order bits are cleared.
3125 ///
3126 /// \headerfile <x86intrin.h>
3127 ///
3128 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3129 ///
3130 /// \param __a
3131 /// A 128-bit integer vector containing the source operand.
3132 /// \param __count
3133 /// An integer value specifying the number of bits to right-shift each value
3134 /// in operand \a __a.
3135 /// \returns A 128-bit integer vector containing the right-shifted values.
3136 static __inline__ __m128i __DEFAULT_FN_ATTRS
3137 _mm_srli_epi64(__m128i __a, int __count)
3138 {
3139  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3140 }
3141 
3142 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
3143 /// operand by the specified number of bits. High-order bits are cleared.
3144 ///
3145 /// \headerfile <x86intrin.h>
3146 ///
3147 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3148 ///
3149 /// \param __a
3150 /// A 128-bit integer vector containing the source operand.
3151 /// \param __count
3152 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3153 /// to right-shift each value in operand \a __a.
3154 /// \returns A 128-bit integer vector containing the right-shifted values.
3155 static __inline__ __m128i __DEFAULT_FN_ATTRS
3156 _mm_srl_epi64(__m128i __a, __m128i __count)
3157 {
3158  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3159 }
3160 
3161 /// Compares each of the corresponding 8-bit values of the 128-bit
3162 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3163 /// for true.
3164 ///
3165 /// \headerfile <x86intrin.h>
3166 ///
3167 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3168 ///
3169 /// \param __a
3170 /// A 128-bit integer vector.
3171 /// \param __b
3172 /// A 128-bit integer vector.
3173 /// \returns A 128-bit integer vector containing the comparison results.
3174 static __inline__ __m128i __DEFAULT_FN_ATTRS
3175 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
3176 {
3177  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3178 }
3179 
3180 /// Compares each of the corresponding 16-bit values of the 128-bit
3181 /// integer vectors for equality. Each comparison yields 0x0 for false,
3182 /// 0xFFFF for true.
3183 ///
3184 /// \headerfile <x86intrin.h>
3185 ///
3186 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3187 ///
3188 /// \param __a
3189 /// A 128-bit integer vector.
3190 /// \param __b
3191 /// A 128-bit integer vector.
3192 /// \returns A 128-bit integer vector containing the comparison results.
3193 static __inline__ __m128i __DEFAULT_FN_ATTRS
3194 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
3195 {
3196  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3197 }
3198 
3199 /// Compares each of the corresponding 32-bit values of the 128-bit
3200 /// integer vectors for equality. Each comparison yields 0x0 for false,
3201 /// 0xFFFFFFFF for true.
3202 ///
3203 /// \headerfile <x86intrin.h>
3204 ///
3205 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3206 ///
3207 /// \param __a
3208 /// A 128-bit integer vector.
3209 /// \param __b
3210 /// A 128-bit integer vector.
3211 /// \returns A 128-bit integer vector containing the comparison results.
3212 static __inline__ __m128i __DEFAULT_FN_ATTRS
3213 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
3214 {
3215  return (__m128i)((__v4si)__a == (__v4si)__b);
3216 }
3217 
3218 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3219 /// integer vectors to determine if the values in the first operand are
3220 /// greater than those in the second operand. Each comparison yields 0x0 for
3221 /// false, 0xFF for true.
3222 ///
3223 /// \headerfile <x86intrin.h>
3224 ///
3225 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3226 ///
3227 /// \param __a
3228 /// A 128-bit integer vector.
3229 /// \param __b
3230 /// A 128-bit integer vector.
3231 /// \returns A 128-bit integer vector containing the comparison results.
3232 static __inline__ __m128i __DEFAULT_FN_ATTRS
3233 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
3234 {
3235  /* This function always performs a signed comparison, but __v16qi is a char
3236  which may be signed or unsigned, so use __v16qs. */
3237  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3238 }
3239 
3240 /// Compares each of the corresponding signed 16-bit values of the
3241 /// 128-bit integer vectors to determine if the values in the first operand
3242 /// are greater than those in the second operand.
3243 ///
3244 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3245 ///
3246 /// \headerfile <x86intrin.h>
3247 ///
3248 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3249 ///
3250 /// \param __a
3251 /// A 128-bit integer vector.
3252 /// \param __b
3253 /// A 128-bit integer vector.
3254 /// \returns A 128-bit integer vector containing the comparison results.
3255 static __inline__ __m128i __DEFAULT_FN_ATTRS
3256 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
3257 {
3258  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3259 }
3260 
3261 /// Compares each of the corresponding signed 32-bit values of the
3262 /// 128-bit integer vectors to determine if the values in the first operand
3263 /// are greater than those in the second operand.
3264 ///
3265 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3266 ///
3267 /// \headerfile <x86intrin.h>
3268 ///
3269 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3270 ///
3271 /// \param __a
3272 /// A 128-bit integer vector.
3273 /// \param __b
3274 /// A 128-bit integer vector.
3275 /// \returns A 128-bit integer vector containing the comparison results.
3276 static __inline__ __m128i __DEFAULT_FN_ATTRS
3277 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
3278 {
3279  return (__m128i)((__v4si)__a > (__v4si)__b);
3280 }
3281 
3282 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3283 /// integer vectors to determine if the values in the first operand are less
3284 /// than those in the second operand.
3285 ///
3286 /// Each comparison yields 0x0 for false, 0xFF for true.
3287 ///
3288 /// \headerfile <x86intrin.h>
3289 ///
3290 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3291 ///
3292 /// \param __a
3293 /// A 128-bit integer vector.
3294 /// \param __b
3295 /// A 128-bit integer vector.
3296 /// \returns A 128-bit integer vector containing the comparison results.
3297 static __inline__ __m128i __DEFAULT_FN_ATTRS
3298 _mm_cmplt_epi8(__m128i __a, __m128i __b)
3299 {
3300  return _mm_cmpgt_epi8(__b, __a);
3301 }
3302 
3303 /// Compares each of the corresponding signed 16-bit values of the
3304 /// 128-bit integer vectors to determine if the values in the first operand
3305 /// are less than those in the second operand.
3306 ///
3307 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3308 ///
3309 /// \headerfile <x86intrin.h>
3310 ///
3311 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3312 ///
3313 /// \param __a
3314 /// A 128-bit integer vector.
3315 /// \param __b
3316 /// A 128-bit integer vector.
3317 /// \returns A 128-bit integer vector containing the comparison results.
3318 static __inline__ __m128i __DEFAULT_FN_ATTRS
3319 _mm_cmplt_epi16(__m128i __a, __m128i __b)
3320 {
3321  return _mm_cmpgt_epi16(__b, __a);
3322 }
3323 
3324 /// Compares each of the corresponding signed 32-bit values of the
3325 /// 128-bit integer vectors to determine if the values in the first operand
3326 /// are less than those in the second operand.
3327 ///
3328 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3329 ///
3330 /// \headerfile <x86intrin.h>
3331 ///
3332 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3333 ///
3334 /// \param __a
3335 /// A 128-bit integer vector.
3336 /// \param __b
3337 /// A 128-bit integer vector.
3338 /// \returns A 128-bit integer vector containing the comparison results.
3339 static __inline__ __m128i __DEFAULT_FN_ATTRS
3340 _mm_cmplt_epi32(__m128i __a, __m128i __b)
3341 {
3342  return _mm_cmpgt_epi32(__b, __a);
3343 }
3344 
3345 #ifdef __x86_64__
3346 /// Converts a 64-bit signed integer value from the second operand into a
3347 /// double-precision value and returns it in the lower element of a [2 x
3348 /// double] vector; the upper element of the returned vector is copied from
3349 /// the upper element of the first operand.
3350 ///
3351 /// \headerfile <x86intrin.h>
3352 ///
3353 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3354 ///
3355 /// \param __a
3356 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3357 /// copied to the upper 64 bits of the destination.
3358 /// \param __b
3359 /// A 64-bit signed integer operand containing the value to be converted.
3360 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3361 /// converted value of the second operand. The upper 64 bits are copied from
3362 /// the upper 64 bits of the first operand.
3363 static __inline__ __m128d __DEFAULT_FN_ATTRS
3364 _mm_cvtsi64_sd(__m128d __a, long long __b)
3365 {
3366  __a[0] = __b;
3367  return __a;
3368 }
3369 
3370 /// Converts the first (lower) element of a vector of [2 x double] into a
3371 /// 64-bit signed integer value, according to the current rounding mode.
3372 ///
3373 /// \headerfile <x86intrin.h>
3374 ///
3375 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3376 ///
3377 /// \param __a
3378 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3379 /// conversion.
3380 /// \returns A 64-bit signed integer containing the converted value.
3381 static __inline__ long long __DEFAULT_FN_ATTRS
3382 _mm_cvtsd_si64(__m128d __a)
3383 {
3384  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3385 }
3386 
3387 /// Converts the first (lower) element of a vector of [2 x double] into a
3388 /// 64-bit signed integer value, truncating the result when it is inexact.
3389 ///
3390 /// \headerfile <x86intrin.h>
3391 ///
3392 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3393 /// instruction.
3394 ///
3395 /// \param __a
3396 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3397 /// conversion.
3398 /// \returns A 64-bit signed integer containing the converted value.
3399 static __inline__ long long __DEFAULT_FN_ATTRS
3400 _mm_cvttsd_si64(__m128d __a)
3401 {
3402  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3403 }
3404 #endif
3405 
3406 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3407 ///
3408 /// \headerfile <x86intrin.h>
3409 ///
3410 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3411 ///
3412 /// \param __a
3413 /// A 128-bit integer vector.
3414 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3415 static __inline__ __m128 __DEFAULT_FN_ATTRS
3417 {
3418  return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
3419 }
3420 
3421 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3422 ///
3423 /// \headerfile <x86intrin.h>
3424 ///
3425 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3426 ///
3427 /// \param __a
3428 /// A 128-bit vector of [4 x float].
3429 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3430 /// values.
3431 static __inline__ __m128i __DEFAULT_FN_ATTRS
3433 {
3434  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3435 }
3436 
3437 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3438 /// truncating the result when it is inexact.
3439 ///
3440 /// \headerfile <x86intrin.h>
3441 ///
3442 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3443 /// instruction.
3444 ///
3445 /// \param __a
3446 /// A 128-bit vector of [4 x float].
3447 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3448 static __inline__ __m128i __DEFAULT_FN_ATTRS
3450 {
3451  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3452 }
3453 
3454 /// Returns a vector of [4 x i32] where the lowest element is the input
3455 /// operand and the remaining elements are zero.
3456 ///
3457 /// \headerfile <x86intrin.h>
3458 ///
3459 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3460 ///
3461 /// \param __a
3462 /// A 32-bit signed integer operand.
3463 /// \returns A 128-bit vector of [4 x i32].
3464 static __inline__ __m128i __DEFAULT_FN_ATTRS
3466 {
3467  return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
3468 }
3469 
3470 #ifdef __x86_64__
3471 /// Returns a vector of [2 x i64] where the lower element is the input
3472 /// operand and the upper element is zero.
3473 ///
3474 /// \headerfile <x86intrin.h>
3475 ///
3476 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3477 ///
3478 /// \param __a
3479 /// A 64-bit signed integer operand containing the value to be converted.
3480 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3481 static __inline__ __m128i __DEFAULT_FN_ATTRS
3482 _mm_cvtsi64_si128(long long __a)
3483 {
3484  return __extension__ (__m128i)(__v2di){ __a, 0 };
3485 }
3486 #endif
3487 
3488 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3489 /// 32-bit signed integer value.
3490 ///
3491 /// \headerfile <x86intrin.h>
3492 ///
3493 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3494 ///
3495 /// \param __a
3496 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3497 /// destination.
3498 /// \returns A 32-bit signed integer containing the moved value.
3499 static __inline__ int __DEFAULT_FN_ATTRS
3501 {
3502  __v4si __b = (__v4si)__a;
3503  return __b[0];
3504 }
3505 
3506 #ifdef __x86_64__
3507 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3508 /// 64-bit signed integer value.
3509 ///
3510 /// \headerfile <x86intrin.h>
3511 ///
3512 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3513 ///
3514 /// \param __a
3515 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3516 /// destination.
3517 /// \returns A 64-bit signed integer containing the moved value.
3518 static __inline__ long long __DEFAULT_FN_ATTRS
3519 _mm_cvtsi128_si64(__m128i __a)
3520 {
3521  return __a[0];
3522 }
3523 #endif
3524 
3525 /// Moves packed integer values from an aligned 128-bit memory location
3526 /// to elements in a 128-bit integer vector.
3527 ///
3528 /// \headerfile <x86intrin.h>
3529 ///
3530 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3531 ///
3532 /// \param __p
3533 /// An aligned pointer to a memory location containing integer values.
3534 /// \returns A 128-bit integer vector containing the moved values.
3535 static __inline__ __m128i __DEFAULT_FN_ATTRS
3536 _mm_load_si128(__m128i const *__p)
3537 {
3538  return *__p;
3539 }
3540 
3541 /// Moves packed integer values from an unaligned 128-bit memory location
3542 /// to elements in a 128-bit integer vector.
3543 ///
3544 /// \headerfile <x86intrin.h>
3545 ///
3546 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3547 ///
3548 /// \param __p
3549 /// A pointer to a memory location containing integer values.
3550 /// \returns A 128-bit integer vector containing the moved values.
3551 static __inline__ __m128i __DEFAULT_FN_ATTRS
3552 _mm_loadu_si128(__m128i_u const *__p)
3553 {
3554  struct __loadu_si128 {
3555  __m128i_u __v;
3556  } __attribute__((__packed__, __may_alias__));
3557  return ((const struct __loadu_si128*)__p)->__v;
3558 }
3559 
3560 /// Returns a vector of [2 x i64] where the lower element is taken from
3561 /// the lower element of the operand, and the upper element is zero.
3562 ///
3563 /// \headerfile <x86intrin.h>
3564 ///
3565 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3566 ///
3567 /// \param __p
3568 /// A pointer to a 64-bit memory location. The value read from it is written
3569 /// to bits [63:0] of the destination.
3570 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3571 /// moved value. The higher order bits are cleared.
3572 static __inline__ __m128i __DEFAULT_FN_ATTRS
3573 _mm_loadl_epi64(__m128i_u const *__p)
3574 {
3575  struct __mm_loadl_epi64_struct {
3576  long long __u;
3577  } __attribute__((__packed__, __may_alias__));
3578  return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3579 }
3580 
3581 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3582 /// This could be used as an argument to another intrinsic function where the
3583 /// argument is required but the value is not actually used.
3584 ///
3585 /// \headerfile <x86intrin.h>
3586 ///
3587 /// This intrinsic has no corresponding instruction.
3588 ///
3589 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3590 static __inline__ __m128i __DEFAULT_FN_ATTRS
3592 {
3593  return (__m128i)__builtin_ia32_undef128();
3594 }
3595 
3596 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3597 /// the specified 64-bit integer values.
3598 ///
3599 /// \headerfile <x86intrin.h>
3600 ///
3601 /// This intrinsic is a utility function and does not correspond to a specific
3602 /// instruction.
3603 ///
3604 /// \param __q1
3605 /// A 64-bit integer value used to initialize the upper 64 bits of the
3606 /// destination vector of [2 x i64].
3607 /// \param __q0
3608 /// A 64-bit integer value used to initialize the lower 64 bits of the
3609 /// destination vector of [2 x i64].
3610 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3611 /// provided in the operands.
3612 static __inline__ __m128i __DEFAULT_FN_ATTRS
3613 _mm_set_epi64x(long long __q1, long long __q0)
3614 {
3615  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
3616 }
3617 
3618 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3619 /// the specified 64-bit integer values.
3620 ///
3621 /// \headerfile <x86intrin.h>
3622 ///
3623 /// This intrinsic is a utility function and does not correspond to a specific
3624 /// instruction.
3625 ///
3626 /// \param __q1
3627 /// A 64-bit integer value used to initialize the upper 64 bits of the
3628 /// destination vector of [2 x i64].
3629 /// \param __q0
3630 /// A 64-bit integer value used to initialize the lower 64 bits of the
3631 /// destination vector of [2 x i64].
3632 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3633 /// provided in the operands.
3634 static __inline__ __m128i __DEFAULT_FN_ATTRS
3635 _mm_set_epi64(__m64 __q1, __m64 __q0)
3636 {
3637  return _mm_set_epi64x((long long)__q1, (long long)__q0);
3638 }
3639 
3640 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3641 /// the specified 32-bit integer values.
3642 ///
3643 /// \headerfile <x86intrin.h>
3644 ///
3645 /// This intrinsic is a utility function and does not correspond to a specific
3646 /// instruction.
3647 ///
3648 /// \param __i3
3649 /// A 32-bit integer value used to initialize bits [127:96] of the
3650 /// destination vector.
3651 /// \param __i2
3652 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3653 /// vector.
3654 /// \param __i1
3655 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3656 /// vector.
3657 /// \param __i0
3658 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3659 /// vector.
3660 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3661 /// provided in the operands.
3662 static __inline__ __m128i __DEFAULT_FN_ATTRS
3663 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3664 {
3665  return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3666 }
3667 
3668 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3669 /// the specified 16-bit integer values.
3670 ///
3671 /// \headerfile <x86intrin.h>
3672 ///
3673 /// This intrinsic is a utility function and does not correspond to a specific
3674 /// instruction.
3675 ///
3676 /// \param __w7
3677 /// A 16-bit integer value used to initialize bits [127:112] of the
3678 /// destination vector.
3679 /// \param __w6
3680 /// A 16-bit integer value used to initialize bits [111:96] of the
3681 /// destination vector.
3682 /// \param __w5
3683 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3684 /// vector.
3685 /// \param __w4
3686 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3687 /// vector.
3688 /// \param __w3
3689 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3690 /// vector.
3691 /// \param __w2
3692 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3693 /// vector.
3694 /// \param __w1
3695 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3696 /// vector.
3697 /// \param __w0
3698 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3699 /// vector.
3700 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3701 /// provided in the operands.
3702 static __inline__ __m128i __DEFAULT_FN_ATTRS
3703 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3704 {
3705  return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3706 }
3707 
3708 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3709 /// the specified 8-bit integer values.
3710 ///
3711 /// \headerfile <x86intrin.h>
3712 ///
3713 /// This intrinsic is a utility function and does not correspond to a specific
3714 /// instruction.
3715 ///
3716 /// \param __b15
3717 /// Initializes bits [127:120] of the destination vector.
3718 /// \param __b14
3719 /// Initializes bits [119:112] of the destination vector.
3720 /// \param __b13
3721 /// Initializes bits [111:104] of the destination vector.
3722 /// \param __b12
3723 /// Initializes bits [103:96] of the destination vector.
3724 /// \param __b11
3725 /// Initializes bits [95:88] of the destination vector.
3726 /// \param __b10
3727 /// Initializes bits [87:80] of the destination vector.
3728 /// \param __b9
3729 /// Initializes bits [79:72] of the destination vector.
3730 /// \param __b8
3731 /// Initializes bits [71:64] of the destination vector.
3732 /// \param __b7
3733 /// Initializes bits [63:56] of the destination vector.
3734 /// \param __b6
3735 /// Initializes bits [55:48] of the destination vector.
3736 /// \param __b5
3737 /// Initializes bits [47:40] of the destination vector.
3738 /// \param __b4
3739 /// Initializes bits [39:32] of the destination vector.
3740 /// \param __b3
3741 /// Initializes bits [31:24] of the destination vector.
3742 /// \param __b2
3743 /// Initializes bits [23:16] of the destination vector.
3744 /// \param __b1
3745 /// Initializes bits [15:8] of the destination vector.
3746 /// \param __b0
3747 /// Initializes bits [7:0] of the destination vector.
3748 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3749 /// provided in the operands.
3750 static __inline__ __m128i __DEFAULT_FN_ATTRS
3751 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3752 {
3753  return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3754 }
3755 
3756 /// Initializes both values in a 128-bit integer vector with the
3757 /// specified 64-bit integer value.
3758 ///
3759 /// \headerfile <x86intrin.h>
3760 ///
3761 /// This intrinsic is a utility function and does not correspond to a specific
3762 /// instruction.
3763 ///
3764 /// \param __q
3765 /// Integer value used to initialize the elements of the destination integer
3766 /// vector.
3767 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3768 /// elements containing the value provided in the operand.
3769 static __inline__ __m128i __DEFAULT_FN_ATTRS
3770 _mm_set1_epi64x(long long __q)
3771 {
3772  return _mm_set_epi64x(__q, __q);
3773 }
3774 
3775 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3776 /// specified 64-bit value.
3777 ///
3778 /// \headerfile <x86intrin.h>
3779 ///
3780 /// This intrinsic is a utility function and does not correspond to a specific
3781 /// instruction.
3782 ///
3783 /// \param __q
3784 /// A 64-bit value used to initialize the elements of the destination integer
3785 /// vector.
3786 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3787 /// containing the value provided in the operand.
3788 static __inline__ __m128i __DEFAULT_FN_ATTRS
3789 _mm_set1_epi64(__m64 __q)
3790 {
3791  return _mm_set_epi64(__q, __q);
3792 }
3793 
3794 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3795 /// specified 32-bit value.
3796 ///
3797 /// \headerfile <x86intrin.h>
3798 ///
3799 /// This intrinsic is a utility function and does not correspond to a specific
3800 /// instruction.
3801 ///
3802 /// \param __i
3803 /// A 32-bit value used to initialize the elements of the destination integer
3804 /// vector.
3805 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3806 /// containing the value provided in the operand.
3807 static __inline__ __m128i __DEFAULT_FN_ATTRS
3809 {
3810  return _mm_set_epi32(__i, __i, __i, __i);
3811 }
3812 
3813 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3814 /// specified 16-bit value.
3815 ///
3816 /// \headerfile <x86intrin.h>
3817 ///
3818 /// This intrinsic is a utility function and does not correspond to a specific
3819 /// instruction.
3820 ///
3821 /// \param __w
3822 /// A 16-bit value used to initialize the elements of the destination integer
3823 /// vector.
3824 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3825 /// containing the value provided in the operand.
3826 static __inline__ __m128i __DEFAULT_FN_ATTRS
3827 _mm_set1_epi16(short __w)
3828 {
3829  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3830 }
3831 
3832 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3833 /// specified 8-bit value.
3834 ///
3835 /// \headerfile <x86intrin.h>
3836 ///
3837 /// This intrinsic is a utility function and does not correspond to a specific
3838 /// instruction.
3839 ///
3840 /// \param __b
3841 /// An 8-bit value used to initialize the elements of the destination integer
3842 /// vector.
3843 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3844 /// containing the value provided in the operand.
3845 static __inline__ __m128i __DEFAULT_FN_ATTRS
3847 {
3848  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
3849 }
3850 
3851 /// Constructs a 128-bit integer vector, initialized in reverse order
3852 /// with the specified 64-bit integral values.
3853 ///
3854 /// \headerfile <x86intrin.h>
3855 ///
3856 /// This intrinsic does not correspond to a specific instruction.
3857 ///
3858 /// \param __q0
3859 /// A 64-bit integral value used to initialize the lower 64 bits of the
3860 /// result.
3861 /// \param __q1
3862 /// A 64-bit integral value used to initialize the upper 64 bits of the
3863 /// result.
3864 /// \returns An initialized 128-bit integer vector.
3865 static __inline__ __m128i __DEFAULT_FN_ATTRS
3866 _mm_setr_epi64(__m64 __q0, __m64 __q1)
3867 {
3868  return _mm_set_epi64(__q1, __q0);
3869 }
3870 
3871 /// Constructs a 128-bit integer vector, initialized in reverse order
3872 /// with the specified 32-bit integral values.
3873 ///
3874 /// \headerfile <x86intrin.h>
3875 ///
3876 /// This intrinsic is a utility function and does not correspond to a specific
3877 /// instruction.
3878 ///
3879 /// \param __i0
3880 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3881 /// \param __i1
3882 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3883 /// \param __i2
3884 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3885 /// \param __i3
3886 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3887 /// \returns An initialized 128-bit integer vector.
3888 static __inline__ __m128i __DEFAULT_FN_ATTRS
3889 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3890 {
3891  return _mm_set_epi32(__i3, __i2, __i1, __i0);
3892 }
3893 
3894 /// Constructs a 128-bit integer vector, initialized in reverse order
3895 /// with the specified 16-bit integral values.
3896 ///
3897 /// \headerfile <x86intrin.h>
3898 ///
3899 /// This intrinsic is a utility function and does not correspond to a specific
3900 /// instruction.
3901 ///
3902 /// \param __w0
3903 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3904 /// \param __w1
3905 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3906 /// \param __w2
3907 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3908 /// \param __w3
3909 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3910 /// \param __w4
3911 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3912 /// \param __w5
3913 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3914 /// \param __w6
3915 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3916 /// \param __w7
3917 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3918 /// \returns An initialized 128-bit integer vector.
3919 static __inline__ __m128i __DEFAULT_FN_ATTRS
3920 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3921 {
3922  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3923 }
3924 
3925 /// Constructs a 128-bit integer vector, initialized in reverse order
3926 /// with the specified 8-bit integral values.
3927 ///
3928 /// \headerfile <x86intrin.h>
3929 ///
3930 /// This intrinsic is a utility function and does not correspond to a specific
3931 /// instruction.
3932 ///
3933 /// \param __b0
3934 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3935 /// \param __b1
3936 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3937 /// \param __b2
3938 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3939 /// \param __b3
3940 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3941 /// \param __b4
3942 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3943 /// \param __b5
3944 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3945 /// \param __b6
3946 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3947 /// \param __b7
3948 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3949 /// \param __b8
3950 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3951 /// \param __b9
3952 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3953 /// \param __b10
3954 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3955 /// \param __b11
3956 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3957 /// \param __b12
3958 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3959 /// \param __b13
3960 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3961 /// \param __b14
3962 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3963 /// \param __b15
3964 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3965 /// \returns An initialized 128-bit integer vector.
3966 static __inline__ __m128i __DEFAULT_FN_ATTRS
3967 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3968 {
3969  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); /* same sixteen values, forwarded to _mm_set_epi8 in reverse argument order */
3970 }
3971 
3972 /// Creates a 128-bit integer vector initialized to zero.
3973 ///
3974 /// \headerfile <x86intrin.h>
3975 ///
3976 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3977 ///
3978 /// \returns An initialized 128-bit integer vector with all elements set to
3979 /// zero.
3980 static __inline__ __m128i __DEFAULT_FN_ATTRS
3981 _mm_setzero_si128(void)
3982 {
3983  return __extension__ (__m128i)(__v2di){ 0LL, 0LL }; /* two zero 64-bit elements, cast to the generic integer vector type */
3984 }
3985 
3986 /// Stores a 128-bit integer vector to a memory location aligned on a
3987 /// 128-bit boundary.
3988 ///
3989 /// \headerfile <x86intrin.h>
3990 ///
3991 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3992 ///
3993 /// \param __p
3994 /// A pointer to an aligned memory location that will receive the integer
3995 /// values.
3996 /// \param __b
3997 /// A 128-bit integer vector containing the values to be moved.
3998 static __inline__ void __DEFAULT_FN_ATTRS
3999 _mm_store_si128(__m128i *__p, __m128i __b)
4000 {
4001  *__p = __b; /* plain vector assignment through __m128i*, a type declared with 16-byte alignment */
4002 }
4003 
4004 /// Stores a 128-bit integer vector to an unaligned memory location.
4005 ///
4006 /// \headerfile <x86intrin.h>
4007 ///
4008 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
4009 ///
4010 /// \param __p
4011 /// A pointer to a memory location that will receive the integer values.
4012 /// \param __b
4013 /// A 128-bit integer vector containing the values to be moved.
4014 static __inline__ void __DEFAULT_FN_ATTRS
4015 _mm_storeu_si128(__m128i_u *__p, __m128i __b)
4016 {
4017  struct __storeu_si128 { /* function-local wrapper type, used only for the single store below */
4018  __m128i_u __v;
4019  } __attribute__((__packed__, __may_alias__)); /* packed: the wrapper adds no alignment requirement; may_alias: the access is exempt from type-based aliasing assumptions */
4020  ((struct __storeu_si128*)__p)->__v = __b; /* all 128 bits are written through the wrapper */
4021 }
4022 
4023 /// Stores a 64-bit integer value from the low element of a 128-bit integer
4024 /// vector.
4025 ///
4026 /// \headerfile <x86intrin.h>
4027 ///
4028 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4029 ///
4030 /// \param __p
4031 /// A pointer to a 64-bit memory location. The address of the memory
4032 /// location does not have to be aligned.
4033 /// \param __b
4034 /// A 128-bit integer vector containing the value to be stored.
4035 static __inline__ void __DEFAULT_FN_ATTRS
4036 _mm_storeu_si64(void *__p, __m128i __b)
4037 {
4038  struct __storeu_si64 { /* function-local wrapper holding one 64-bit integer */
4039  long long __v;
4040  } __attribute__((__packed__, __may_alias__)); /* packed: no alignment requirement on __p; may_alias: exempt from type-based aliasing assumptions */
4041  ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0]; /* element 0 of the [2 x i64] view is the value stored; element 1 is ignored */
4042 }
4043 
4044 /// Stores a 32-bit integer value from the low element of a 128-bit integer
4045 /// vector.
4046 ///
4047 /// \headerfile <x86intrin.h>
4048 ///
4049 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
4050 ///
4051 /// \param __p
4052 /// A pointer to a 32-bit memory location. The address of the memory
4053 /// location does not have to be aligned.
4054 /// \param __b
4055 /// A 128-bit integer vector containing the value to be stored.
4056 static __inline__ void __DEFAULT_FN_ATTRS
4057 _mm_storeu_si32(void *__p, __m128i __b)
4058 {
4059  struct __storeu_si32 { /* function-local wrapper holding one int */
4060  int __v;
4061  } __attribute__((__packed__, __may_alias__)); /* packed: no alignment requirement on __p; may_alias: exempt from type-based aliasing assumptions */
4062  ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0]; /* element 0 of the __v4si view is the value stored; the other elements are ignored */
4063 }
4064 
4065 /// Stores a 16-bit integer value from the low element of a 128-bit integer
4066 /// vector.
4067 ///
4068 /// \headerfile <x86intrin.h>
4069 ///
4070 /// This intrinsic does not correspond to a specific instruction.
4071 ///
4072 /// \param __p
4073 /// A pointer to a 16-bit memory location. The address of the memory
4074 /// location does not have to be aligned.
4075 /// \param __b
4076 /// A 128-bit integer vector containing the value to be stored.
4077 static __inline__ void __DEFAULT_FN_ATTRS
4078 _mm_storeu_si16(void *__p, __m128i __b)
4079 {
4080  struct __storeu_si16 { /* function-local wrapper holding one short */
4081  short __v;
4082  } __attribute__((__packed__, __may_alias__)); /* packed: no alignment requirement on __p; may_alias: exempt from type-based aliasing assumptions */
4083  ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0]; /* element 0 of the [8 x i16] view is the value stored; the other elements are ignored */
4084 }
4085 
4086 /// Moves bytes selected by the mask from the first operand to the
4087 /// specified unaligned memory location. When a mask bit is 1, the
4088 /// corresponding byte is written, otherwise it is not written.
4089 ///
4090 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4091 /// used again soon). Exception and trap behavior for elements not selected
4092 /// for storage to memory are implementation dependent.
4093 ///
4094 /// \headerfile <x86intrin.h>
4095 ///
4096 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
4097 /// instruction.
4098 ///
4099 /// \param __d
4100 /// A 128-bit integer vector containing the values to be moved.
4101 /// \param __n
4102 /// A 128-bit integer vector containing the mask. The most significant bit of
4103 /// each byte represents the mask bits.
4104 /// \param __p
4105 /// A pointer to an unaligned 128-bit memory location where the specified
4106 /// values are moved.
4107 static __inline__ void __DEFAULT_FN_ATTRS
4108 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
4109 {
4110  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); /* data and mask are both passed as [16 x i8] views; __p is forwarded unchanged */
4111 }
4112 
4113 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4114 /// a memory location.
4115 ///
4116 /// \headerfile <x86intrin.h>
4117 ///
4118 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4119 ///
4120 /// \param __p
4121 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
4122 /// of the integer vector parameter.
4123 /// \param __a
4124 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4125 /// value to be stored.
4126 static __inline__ void __DEFAULT_FN_ATTRS
4127 _mm_storel_epi64(__m128i_u *__p, __m128i __a)
4128 {
4129  struct __mm_storel_epi64_struct { /* function-local wrapper holding one 64-bit integer */
4130  long long __u;
4131  } __attribute__((__packed__, __may_alias__)); /* packed: no alignment requirement on __p; may_alias: exempt from type-based aliasing assumptions */
4132  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; /* __m128i is itself a vector of long long, so __a[0] needs no cast; only one long long is written although __p is typed __m128i_u* */
4133 }
4134 
4135 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4136 /// aligned memory location.
4137 ///
4138 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4139 /// used again soon).
4140 ///
4141 /// \headerfile <x86intrin.h>
4142 ///
4143 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4144 ///
4145 /// \param __p
4146 /// A pointer to the 128-bit aligned memory location used to store the value.
4147 /// \param __a
4148 /// A vector of [2 x double] containing the 64-bit values to be stored.
4149 static __inline__ void __DEFAULT_FN_ATTRS
4150 _mm_stream_pd(double *__p, __m128d __a)
4151 {
4152  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); /* the double* is reinterpreted as a pointer to a whole [2 x double] vector, so both elements are stored */
4153 }
4154 
4155 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4156 ///
4157 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4158 /// used again soon).
4159 ///
4160 /// \headerfile <x86intrin.h>
4161 ///
4162 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4163 ///
4164 /// \param __p
4165 /// A pointer to the 128-bit aligned memory location used to store the value.
4166 /// \param __a
4167 /// A 128-bit integer vector containing the values to be stored.
4168 static __inline__ void __DEFAULT_FN_ATTRS
4169 _mm_stream_si128(__m128i *__p, __m128i __a)
4170 {
4171  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); /* value and destination are both viewed as [2 x i64] for the builtin */
4172 }
4173 
4174 /// Stores a 32-bit integer value in the specified memory location.
4175 ///
4176 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4177 /// used again soon).
4178 ///
4179 /// \headerfile <x86intrin.h>
4180 ///
4181 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4182 ///
4183 /// \param __p
4184 /// A pointer to the 32-bit memory location used to store the value.
4185 /// \param __a
4186 /// A 32-bit integer containing the value to be stored.
4187 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) /* spelled out rather than __DEFAULT_FN_ATTRS: no __min_vector_width__ is applied here */
4188 _mm_stream_si32(int *__p, int __a)
4189 {
4190  __builtin_ia32_movnti(__p, __a); /* pointer and value are forwarded unchanged to the builtin */
4191 }
4192 
4193 #ifdef __x86_64__
4194 /// Stores a 64-bit integer value in the specified memory location.
4195 ///
4196 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4197 /// used again soon).
4198 ///
4199 /// \headerfile <x86intrin.h>
4200 ///
4201 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4202 ///
4203 /// \param __p
4204 /// A pointer to the 64-bit memory location used to store the value.
4205 /// \param __a
4206 /// A 64-bit integer containing the value to be stored.
4207 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) /* spelled out rather than __DEFAULT_FN_ATTRS: no __min_vector_width__ is applied here */
4208 _mm_stream_si64(long long *__p, long long __a)
4209 {
4210  __builtin_ia32_movnti64(__p, __a); /* pointer and value are forwarded unchanged to the builtin */
4211 }
4212 #endif
4213 
4214 #if defined(__cplusplus)
4215 extern "C" {
4216 #endif
4217 
4218 /// The cache line containing \a __p is flushed and invalidated from all
4219 /// caches in the coherency domain.
4220 ///
4221 /// \headerfile <x86intrin.h>
4222 ///
4223 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4224 ///
4225 /// \param __p
4226 /// A pointer to the memory location used to identify the cache line to be
4227 /// flushed.
4228 void _mm_clflush(void const * __p); /* prototype only: no inline body is supplied here; declared inside the extern "C" region for C++ */
4229 
4230 /// Forces strong memory ordering (serialization) between load
4231 /// instructions preceding this instruction and load instructions following
4232 /// this instruction, ensuring the system completes all previous loads before
4233 /// executing subsequent loads.
4234 ///
4235 /// \headerfile <x86intrin.h>
4236 ///
4237 /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4238 ///
4239 void _mm_lfence(void); /* prototype only: no inline body is supplied here; declared inside the extern "C" region for C++ */
4240 
4241 /// Forces strong memory ordering (serialization) between load and store
4242 /// instructions preceding this instruction and load and store instructions
4243 /// following this instruction, ensuring that the system completes all
4244 /// previous memory accesses before executing subsequent memory accesses.
4245 ///
4246 /// \headerfile <x86intrin.h>
4247 ///
4248 /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4249 ///
4250 void _mm_mfence(void); /* prototype only: no inline body is supplied here; declared inside the extern "C" region for C++ */
4251 
4252 #if defined(__cplusplus)
4253 } // extern "C"
4254 #endif
4255 
4256 /// Converts 16-bit signed integers from both 128-bit integer vector
4257 /// operands into 8-bit signed integers, and packs the results into the
4258 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4259 /// Negative values less than 0x80 are saturated to 0x80.
4260 ///
4261 /// \headerfile <x86intrin.h>
4262 ///
4263 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4264 ///
4265 /// \param __a
4266 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4267 /// a signed integer and is converted to an 8-bit signed integer with
4268 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4269 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4270 /// written to the lower 64 bits of the result.
4271 /// \param __b
4272 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4273 /// a signed integer and is converted to an 8-bit signed integer with
4274 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4275 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4276 /// written to the higher 64 bits of the result.
4277 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4278 static __inline__ __m128i __DEFAULT_FN_ATTRS
4279 _mm_packs_epi16(__m128i __a, __m128i __b)
4280 {
4281  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); /* both inputs viewed as [8 x i16]; the builtin's result is cast back to the generic __m128i type */
4282 }
4283 
4284 /// Converts 32-bit signed integers from both 128-bit integer vector
4285 /// operands into 16-bit signed integers, and packs the results into the
4286 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4287 /// Negative values less than 0x8000 are saturated to 0x8000.
4288 ///
4289 /// \headerfile <x86intrin.h>
4290 ///
4291 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4292 ///
4293 /// \param __a
4294 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4295 /// a signed integer and is converted to a 16-bit signed integer with
4296 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4297 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4298 /// are written to the lower 64 bits of the result.
4299 /// \param __b
4300 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4301 /// a signed integer and is converted to a 16-bit signed integer with
4302 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4303 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4304 /// are written to the higher 64 bits of the result.
4305 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4306 static __inline__ __m128i __DEFAULT_FN_ATTRS
4307 _mm_packs_epi32(__m128i __a, __m128i __b)
4308 {
4309  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); /* both inputs viewed as __v4si; the builtin's result is cast back to the generic __m128i type */
4310 }
4311 
4312 /// Converts 16-bit signed integers from both 128-bit integer vector
4313 /// operands into 8-bit unsigned integers, and packs the results into the
4314 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4315 /// than 0x00 are saturated to 0x00.
4316 ///
4317 /// \headerfile <x86intrin.h>
4318 ///
4319 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4320 ///
4321 /// \param __a
4322 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4323 /// a signed integer and is converted to an 8-bit unsigned integer with
4324 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4325 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4326 /// written to the lower 64 bits of the result.
4327 /// \param __b
4328 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4329 /// a signed integer and is converted to an 8-bit unsigned integer with
4330 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4331 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4332 /// written to the higher 64 bits of the result.
4333 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4334 static __inline__ __m128i __DEFAULT_FN_ATTRS
4335 _mm_packus_epi16(__m128i __a, __m128i __b)
4336 {
4337  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); /* both inputs viewed as [8 x i16]; the builtin's result is cast back to the generic __m128i type */
4338 }
4339 
4340 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4341 /// the immediate-value parameter as a selector.
4342 ///
4343 /// \headerfile <x86intrin.h>
4344 ///
4345 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4346 ///
4347 /// \param __a
4348 /// A 128-bit integer vector.
4349 /// \param __imm
4350 /// An immediate value. Bits [2:0] select values from \a __a to be assigned
4351 /// to bits[15:0] of the result. \n
4352 /// 000: assign values from bits [15:0] of \a __a. \n
4353 /// 001: assign values from bits [31:16] of \a __a. \n
4354 /// 010: assign values from bits [47:32] of \a __a. \n
4355 /// 011: assign values from bits [63:48] of \a __a. \n
4356 /// 100: assign values from bits [79:64] of \a __a. \n
4357 /// 101: assign values from bits [95:80] of \a __a. \n
4358 /// 110: assign values from bits [111:96] of \a __a. \n
4359 /// 111: assign values from bits [127:112] of \a __a.
4360 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4361 /// integer vector parameter and the remaining bits are assigned zeros.
4362 #define _mm_extract_epi16(a, imm) \
4363  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4364  (int)(imm))) /* the (unsigned short) cast applied before (int) is what zero-fills the bits above bit 15 */
4365 
4366 /// Constructs a 128-bit integer vector by first making a copy of the
4367 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4368 /// of an integer parameter into an offset specified by the immediate-value
4369 /// parameter.
4370 ///
4371 /// \headerfile <x86intrin.h>
4372 ///
4373 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4374 ///
4375 /// \param __a
4376 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4377 /// result and then one of the eight elements in the result is replaced by
4378 /// the lower 16 bits of \a __b.
4379 /// \param __b
4380 /// An integer. The lower 16 bits of this parameter are written to the
4381 /// result beginning at an offset specified by \a __imm.
4382 /// \param __imm
4383 /// An immediate value whose bits [2:0] select which of the eight 16-bit
4384 /// elements of the result is replaced by the lower 16 bits of \a __b.
4385 /// \returns A 128-bit integer vector containing the constructed values.
4386 #define _mm_insert_epi16(a, b, imm) \
4387  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4388  (int)(imm))) /* a is cast to __m128i and then viewed as [8 x i16]; b and imm are each cast to int before reaching the builtin */
4389 
4390 /// Copies the values of the most significant bits from each 8-bit
4391 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4392 /// value, zero-extends the value, and writes it to the destination.
4393 ///
4394 /// \headerfile <x86intrin.h>
4395 ///
4396 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4397 ///
4398 /// \param __a
4399 /// A 128-bit integer vector containing the values with bits to be extracted.
4400 /// \returns The most significant bits from each 8-bit element in \a __a,
4401 /// written to bits [15:0]. The other bits are assigned zeros.
4402 static __inline__ int __DEFAULT_FN_ATTRS
4403 _mm_movemask_epi8(__m128i __a)
4404 {
4405  return __builtin_ia32_pmovmskb128((__v16qi)__a); /* input viewed as [16 x i8]; the builtin's result is returned as-is */
4406 }
4407 
4408 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4409 /// elements of a 128-bit integer vector parameter, using the immediate-value
4410 /// parameter as a specifier.
4411 ///
4412 /// \headerfile <x86intrin.h>
4413 ///
4414 /// \code
4415 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4416 /// \endcode
4417 ///
4418 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4419 ///
4420 /// \param a
4421 /// A 128-bit integer vector containing the values to be copied.
4422 /// \param imm
4423 /// An immediate value containing an 8-bit value specifying which elements to
4424 /// copy from a. The destinations within the 128-bit destination are assigned
4425 /// values as follows: \n
4426 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4427 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4428 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4429 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4430 /// Bit value assignments: \n
4431 /// 00: assign values from bits [31:0] of \a a. \n
4432 /// 01: assign values from bits [63:32] of \a a. \n
4433 /// 10: assign values from bits [95:64] of \a a. \n
4434 /// 11: assign values from bits [127:96] of \a a.
4435 /// \returns A 128-bit integer vector containing the shuffled values.
4436 #define _mm_shuffle_epi32(a, imm) \
4437  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))) /* macro rather than function — presumably so imm reaches the builtin as a compile-time constant (confirm); a is viewed as __v4si */
4438 
4439 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4440 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4441 /// value parameter as a specifier.
4442 ///
4443 /// \headerfile <x86intrin.h>
4444 ///
4445 /// \code
4446 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4447 /// \endcode
4448 ///
4449 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4450 ///
4451 /// \param a
4452 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4453 /// [127:64] of the result.
4454 /// \param imm
4455 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4456 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4457 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4458 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4459 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4460 /// Bit value assignments: \n
4461 /// 00: assign values from bits [15:0] of \a a. \n
4462 /// 01: assign values from bits [31:16] of \a a. \n
4463 /// 10: assign values from bits [47:32] of \a a. \n
4464 /// 11: assign values from bits [63:48] of \a a. \n
4465 /// \returns A 128-bit integer vector containing the shuffled values.
4466 #define _mm_shufflelo_epi16(a, imm) \
4467  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))) /* macro rather than function — presumably so imm reaches the builtin as a compile-time constant (confirm); a is viewed as [8 x i16] */
4468 
4469 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4470 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4471 /// value parameter as a specifier.
4472 ///
4473 /// \headerfile <x86intrin.h>
4474 ///
4475 /// \code
4476 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4477 /// \endcode
4478 ///
4479 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4480 ///
4481 /// \param a
4482 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4483 /// [63:0] of the result.
4484 /// \param imm
4485 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4486 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4487 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4488 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4489 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4490 /// Bit value assignments: \n
4491 /// 00: assign values from bits [79:64] of \a a. \n
4492 /// 01: assign values from bits [95:80] of \a a. \n
4493 /// 10: assign values from bits [111:96] of \a a. \n
4494 /// 11: assign values from bits [127:112] of \a a. \n
4495 /// \returns A 128-bit integer vector containing the shuffled values.
4496 #define _mm_shufflehi_epi16(a, imm) \
4497  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))) /* macro rather than function — presumably so imm reaches the builtin as a compile-time constant (confirm); a is viewed as [8 x i16] */
4498 
4499 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4500 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4501 ///
4502 /// \headerfile <x86intrin.h>
4503 ///
4504 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4505 /// instruction.
4506 ///
4507 /// \param __a
4508 /// A 128-bit vector of [16 x i8].
4509 /// Bits [71:64] are written to bits [7:0] of the result. \n
4510 /// Bits [79:72] are written to bits [23:16] of the result. \n
4511 /// Bits [87:80] are written to bits [39:32] of the result. \n
4512 /// Bits [95:88] are written to bits [55:48] of the result. \n
4513 /// Bits [103:96] are written to bits [71:64] of the result. \n
4514 /// Bits [111:104] are written to bits [87:80] of the result. \n
4515 /// Bits [119:112] are written to bits [103:96] of the result. \n
4516 /// Bits [127:120] are written to bits [119:112] of the result.
4517 /// \param __b
4518 /// A 128-bit vector of [16 x i8]. \n
4519 /// Bits [71:64] are written to bits [15:8] of the result. \n
4520 /// Bits [79:72] are written to bits [31:24] of the result. \n
4521 /// Bits [87:80] are written to bits [47:40] of the result. \n
4522 /// Bits [95:88] are written to bits [63:56] of the result. \n
4523 /// Bits [103:96] are written to bits [79:72] of the result. \n
4524 /// Bits [111:104] are written to bits [95:88] of the result. \n
4525 /// Bits [119:112] are written to bits [111:104] of the result. \n
4526 /// Bits [127:120] are written to bits [127:120] of the result.
4527 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4528 static __inline__ __m128i __DEFAULT_FN_ATTRS
4529 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
4530 {
4531  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); /* index k selects element k of __a, 16+k selects element k of __b; the pairs (k, 16+k) for k = 8..15 interleave the upper eight bytes */
4532 }
4533 
4534 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4535 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4536 ///
4537 /// \headerfile <x86intrin.h>
4538 ///
4539 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4540 /// instruction.
4541 ///
4542 /// \param __a
4543 /// A 128-bit vector of [8 x i16].
4544 /// Bits [79:64] are written to bits [15:0] of the result. \n
4545 /// Bits [95:80] are written to bits [47:32] of the result. \n
4546 /// Bits [111:96] are written to bits [79:64] of the result. \n
4547 /// Bits [127:112] are written to bits [111:96] of the result.
4548 /// \param __b
4549 /// A 128-bit vector of [8 x i16].
4550 /// Bits [79:64] are written to bits [31:16] of the result. \n
4551 /// Bits [95:80] are written to bits [63:48] of the result. \n
4552 /// Bits [111:96] are written to bits [95:80] of the result. \n
4553 /// Bits [127:112] are written to bits [127:112] of the result.
4554 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4555 static __inline__ __m128i __DEFAULT_FN_ATTRS
4556 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
4557 {
4558  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); /* index k selects element k of __a, 8+k selects element k of __b; the pairs (k, 8+k) for k = 4..7 interleave the upper four 16-bit elements */
4559 }
4560 
4561 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4562 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4563 ///
4564 /// \headerfile <x86intrin.h>
4565 ///
4566 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4567 /// instruction.
4568 ///
4569 /// \param __a
4570 /// A 128-bit vector of [4 x i32]. \n
4571 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4572 /// Bits [127:96] are written to bits [95:64] of the destination.
4573 /// \param __b
4574 /// A 128-bit vector of [4 x i32]. \n
4575 /// Bits [95:64] are written to bits [63:32] of the destination. \n
4576 /// Bits [127:96] are written to bits [127:96] of the destination.
4577 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4578 static __inline__ __m128i __DEFAULT_FN_ATTRS
4579 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
4580 {
4581  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); /* index k selects element k of __a, 4+k selects element k of __b; result is { a[2], b[2], a[3], b[3] } */
4582 }
4583 
4584 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4585 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4586 ///
4587 /// \headerfile <x86intrin.h>
4588 ///
4589 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4590 /// instruction.
4591 ///
4592 /// \param __a
4593 /// A 128-bit vector of [2 x i64]. \n
4594 /// Bits [127:64] are written to bits [63:0] of the destination.
4595 /// \param __b
4596 /// A 128-bit vector of [2 x i64]. \n
4597 /// Bits [127:64] are written to bits [127:64] of the destination.
4598 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4599 static __inline__ __m128i __DEFAULT_FN_ATTRS
4600 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
4601 {
4602  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); /* index 1 is the upper element of __a, 2+1 the upper element of __b; result is { a[1], b[1] } */
4603 }
4604 
4605 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4606 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4607 ///
4608 /// \headerfile <x86intrin.h>
4609 ///
4610 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4611 /// instruction.
4612 ///
4613 /// \param __a
4614 /// A 128-bit vector of [16 x i8]. \n
4615 /// Bits [7:0] are written to bits [7:0] of the result. \n
4616 /// Bits [15:8] are written to bits [23:16] of the result. \n
4617 /// Bits [23:16] are written to bits [39:32] of the result. \n
4618 /// Bits [31:24] are written to bits [55:48] of the result. \n
4619 /// Bits [39:32] are written to bits [71:64] of the result. \n
4620 /// Bits [47:40] are written to bits [87:80] of the result. \n
4621 /// Bits [55:48] are written to bits [103:96] of the result. \n
4622 /// Bits [63:56] are written to bits [119:112] of the result.
4623 /// \param __b
4624 /// A 128-bit vector of [16 x i8].
4625 /// Bits [7:0] are written to bits [15:8] of the result. \n
4626 /// Bits [15:8] are written to bits [31:24] of the result. \n
4627 /// Bits [23:16] are written to bits [47:40] of the result. \n
4628 /// Bits [31:24] are written to bits [63:56] of the result. \n
4629 /// Bits [39:32] are written to bits [79:72] of the result. \n
4630 /// Bits [47:40] are written to bits [95:88] of the result. \n
4631 /// Bits [55:48] are written to bits [111:104] of the result. \n
4632 /// Bits [63:56] are written to bits [127:120] of the result.
4633 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4634 static __inline__ __m128i __DEFAULT_FN_ATTRS
4635 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
4636 {
4637  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); /* index k selects element k of __a, 16+k selects element k of __b; the pairs (k, 16+k) for k = 0..7 interleave the lower eight bytes */
4638 }
4639 
4640 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4641 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4642 /// [8 x i16].
4643 ///
4644 /// \headerfile <x86intrin.h>
4645 ///
4646 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4647 /// instruction.
4648 ///
4649 /// \param __a
4650 /// A 128-bit vector of [8 x i16].
4651 /// Bits [15:0] are written to bits [15:0] of the result. \n
4652 /// Bits [31:16] are written to bits [47:32] of the result. \n
4653 /// Bits [47:32] are written to bits [79:64] of the result. \n
4654 /// Bits [63:48] are written to bits [111:96] of the result.
4655 /// \param __b
4656 /// A 128-bit vector of [8 x i16].
4657 /// Bits [15:0] are written to bits [31:16] of the result. \n
4658 /// Bits [31:16] are written to bits [63:48] of the result. \n
4659 /// Bits [47:32] are written to bits [95:80] of the result. \n
4660 /// Bits [63:48] are written to bits [127:112] of the result.
4661 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4662 static __inline__ __m128i __DEFAULT_FN_ATTRS
4663 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
4664 {
4665  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); /* index k selects element k of __a, 8+k selects element k of __b; the pairs (k, 8+k) for k = 0..3 interleave the lower four 16-bit elements */
4666 }
4667 
4668 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4669 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4670 ///
4671 /// \headerfile <x86intrin.h>
4672 ///
4673 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4674 /// instruction.
4675 ///
4676 /// \param __a
4677 /// A 128-bit vector of [4 x i32]. \n
4678 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4679 /// Bits [63:32] are written to bits [95:64] of the destination.
4680 /// \param __b
4681 /// A 128-bit vector of [4 x i32]. \n
4682 /// Bits [31:0] are written to bits [64:32] of the destination. \n
4683 /// Bits [63:32] are written to bits [127:96] of the destination.
4684 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4685 static __inline__ __m128i __DEFAULT_FN_ATTRS
4686 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
4687 {
4688  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4689 }
4690 
4691 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4692 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4693 ///
4694 /// \headerfile <x86intrin.h>
4695 ///
4696 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4697 /// instruction.
4698 ///
4699 /// \param __a
4700 /// A 128-bit vector of [2 x i64]. \n
4701 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4702 /// \param __b
4703 /// A 128-bit vector of [2 x i64]. \n
4704 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4705 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4706 static __inline__ __m128i __DEFAULT_FN_ATTRS
4707 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
4708 {
4709  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4710 }
4711 
4712 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4713 /// integer.
4714 ///
4715 /// \headerfile <x86intrin.h>
4716 ///
4717 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4718 ///
4719 /// \param __a
4720 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4721 /// destination.
4722 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4723 static __inline__ __m64 __DEFAULT_FN_ATTRS
4725 {
4726  return (__m64)__a[0];
4727 }
4728 
4729 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4730 /// upper bits.
4731 ///
4732 /// \headerfile <x86intrin.h>
4733 ///
4734 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4735 ///
4736 /// \param __a
4737 /// A 64-bit value.
4738 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4739 /// the operand. The upper 64 bits are assigned zeros.
4740 static __inline__ __m128i __DEFAULT_FN_ATTRS
4742 {
4743  return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
4744 }
4745 
4746 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4747 /// integer vector, zeroing the upper bits.
4748 ///
4749 /// \headerfile <x86intrin.h>
4750 ///
4751 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4752 ///
4753 /// \param __a
4754 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4755 /// destination.
4756 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4757 /// the operand. The upper 64 bits are assigned zeros.
4758 static __inline__ __m128i __DEFAULT_FN_ATTRS
4760 {
4761  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4762 }
4763 
4764 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4765 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4766 /// double].
4767 ///
4768 /// \headerfile <x86intrin.h>
4769 ///
4770 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4771 ///
4772 /// \param __a
4773 /// A 128-bit vector of [2 x double]. \n
4774 /// Bits [127:64] are written to bits [63:0] of the destination.
4775 /// \param __b
4776 /// A 128-bit vector of [2 x double]. \n
4777 /// Bits [127:64] are written to bits [127:64] of the destination.
4778 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4779 static __inline__ __m128d __DEFAULT_FN_ATTRS
4780 _mm_unpackhi_pd(__m128d __a, __m128d __b)
4781 {
4782  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4783 }
4784 
4785 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4786 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4787 /// double].
4788 ///
4789 /// \headerfile <x86intrin.h>
4790 ///
4791 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4792 ///
4793 /// \param __a
4794 /// A 128-bit vector of [2 x double]. \n
4795 /// Bits [63:0] are written to bits [63:0] of the destination.
4796 /// \param __b
4797 /// A 128-bit vector of [2 x double]. \n
4798 /// Bits [63:0] are written to bits [127:64] of the destination.
4799 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4800 static __inline__ __m128d __DEFAULT_FN_ATTRS
4801 _mm_unpacklo_pd(__m128d __a, __m128d __b)
4802 {
4803  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4804 }
4805 
4806 /// Extracts the sign bits of the double-precision values in the 128-bit
4807 /// vector of [2 x double], zero-extends the value, and writes it to the
4808 /// low-order bits of the destination.
4809 ///
4810 /// \headerfile <x86intrin.h>
4811 ///
4812 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4813 ///
4814 /// \param __a
4815 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4816 /// be extracted.
4817 /// \returns The sign bits from each of the double-precision elements in \a __a,
4818 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4819 static __inline__ int __DEFAULT_FN_ATTRS
4821 {
4822  return __builtin_ia32_movmskpd((__v2df)__a);
4823 }
4824 
4825 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///     parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                  (int)(i)))

4854 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4855 /// floating-point vector of [4 x float].
4856 ///
4857 /// \headerfile <x86intrin.h>
4858 ///
4859 /// This intrinsic has no corresponding instruction.
4860 ///
4861 /// \param __a
4862 /// A 128-bit floating-point vector of [2 x double].
4863 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4864 /// bitwise pattern as the parameter.
4865 static __inline__ __m128 __DEFAULT_FN_ATTRS
4867 {
4868  return (__m128)__a;
4869 }
4870 
4871 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4872 /// integer vector.
4873 ///
4874 /// \headerfile <x86intrin.h>
4875 ///
4876 /// This intrinsic has no corresponding instruction.
4877 ///
4878 /// \param __a
4879 /// A 128-bit floating-point vector of [2 x double].
4880 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4881 /// parameter.
4882 static __inline__ __m128i __DEFAULT_FN_ATTRS
4884 {
4885  return (__m128i)__a;
4886 }
4887 
4888 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4889 /// floating-point vector of [2 x double].
4890 ///
4891 /// \headerfile <x86intrin.h>
4892 ///
4893 /// This intrinsic has no corresponding instruction.
4894 ///
4895 /// \param __a
4896 /// A 128-bit floating-point vector of [4 x float].
4897 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4898 /// bitwise pattern as the parameter.
4899 static __inline__ __m128d __DEFAULT_FN_ATTRS
4901 {
4902  return (__m128d)__a;
4903 }
4904 
4905 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4906 /// integer vector.
4907 ///
4908 /// \headerfile <x86intrin.h>
4909 ///
4910 /// This intrinsic has no corresponding instruction.
4911 ///
4912 /// \param __a
4913 /// A 128-bit floating-point vector of [4 x float].
4914 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4915 /// parameter.
4916 static __inline__ __m128i __DEFAULT_FN_ATTRS
4918 {
4919  return (__m128i)__a;
4920 }
4921 
4922 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4923 /// of [4 x float].
4924 ///
4925 /// \headerfile <x86intrin.h>
4926 ///
4927 /// This intrinsic has no corresponding instruction.
4928 ///
4929 /// \param __a
4930 /// A 128-bit integer vector.
4931 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4932 /// bitwise pattern as the parameter.
4933 static __inline__ __m128 __DEFAULT_FN_ATTRS
4935 {
4936  return (__m128)__a;
4937 }
4938 
4939 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4940 /// of [2 x double].
4941 ///
4942 /// \headerfile <x86intrin.h>
4943 ///
4944 /// This intrinsic has no corresponding instruction.
4945 ///
4946 /// \param __a
4947 /// A 128-bit integer vector.
4948 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4949 /// bitwise pattern as the parameter.
4950 static __inline__ __m128d __DEFAULT_FN_ATTRS
4952 {
4953  return (__m128d)__a;
4954 }
4955 
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* The attribute helper macros are private to this header; drop them so they
 * do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Packs two one-bit selectors into a 2-bit value: x lands in bit 1 and y in
 * bit 0. This matches the two-bit immediate layout documented for
 * _mm_shuffle_pd. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values for the denormals-are-zero control bit (0x0040) of the word read by
 * _mm_getcsr() and written by _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON   (0x0040U)
#define _MM_DENORMALS_ZERO_OFF  (0x0000U)

/* Mask that isolates the denormals-are-zero bit. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Reads the current mode: yields _MM_DENORMALS_ZERO_ON or
 * _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Replaces the mode bit with x while preserving every other control/status
 * bit. x is OR-ed in unmasked, so pass only one of the two mode constants. */
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))

4985 #endif /* __EMMINTRIN_H */
_mm_xor_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:411
_mm_set_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1879
_mm_load1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1601
_mm_cvtepi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1342
_mm_cvtpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1512
_mm_undefined_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1803
_mm_set_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3663
_mm_mul_epu32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2534
__x
static __inline unsigned char unsigned int __x
Definition: adxintrin.h:22
_mm_srli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3137
_mm_cmpeq_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3175
_mm_cmpnle_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:920
_mm_add_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2143
_mm_loadl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3573
_mm_madd_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2356
_mm_unpacklo_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4801
_mm_pause
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
_mm_set_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3613
_mm_add_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2165
_mm_storeu_si32
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:4057
_mm_loadu_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3552
_mm_sqrt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:230
_mm_cmpgt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:763
_mm_cmpgt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3277
_mm_subs_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2729
_mm_unpackhi_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4780
__v
struct __storeu_i16 *__P __v
Definition: immintrin.h:373
_mm_comige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1100
_mm_sub_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:102
_mm_srai_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2999
_mm_ucomigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1228
_mm_cmpunord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:845
_mm_storeu_si16
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:4078
_mm_div_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:185
_mm_and_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2747
_mm_add_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2205
_mm_set_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3751
_mm_cvtsd_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1404
_mm_unpacklo_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4686
_mm_cmpeq_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3194
_mm_set_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1823
_mm_max_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:336
_mm_subs_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2668
_mm_setr_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3866
_mm_sll_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2939
__a
static __inline__ void int __a
Definition: emmintrin.h:4189
_mm_ucomineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1280
_mm_ucomile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1202
_mm_castpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4866
_mm_mul_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:162
_mm_sub_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2647
_mm_set1_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3770
_mm_storel_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2096
__DEFAULT_FN_ATTRS_MMX
#define __DEFAULT_FN_ATTRS_MMX
Definition: emmintrin.h:42
_mm_loadu_si16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1707
_mm_sra_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:3019
_mm_storeu_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:2035
_mm_unpackhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4556
_mm_load_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1728
_mm_undefined_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3591
_mm_store_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1976
_mm_packus_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4335
xmmintrin.h
_mm_srl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3156
_mm_setr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1900
_mm_cmpneq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:870
_mm_comineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1126
_mm_cvtsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1379
__DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:41
_mm_cmpgt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:492
_mm_max_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2396
_mm_cvttps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3449
_mm_maskmoveu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:4108
_mm_cmpeq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:430
_mm_cmple_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:471
_mm_loadh_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1755
_mm_cmpge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:789
_mm_sqrt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:248
_mm_cvtps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3432
_mm_movepi64_pi64
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4724
_mm_min_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2416
_mm_xor_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2802
_mm_andnot_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:375
_mm_set1_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3789
_mm_cmpnge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:971
_mm_unpackhi_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4600
_mm_cvtps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1319
_mm_cvtsi32_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1427
_mm_add_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:60
_mm_setr_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3967
_mm_cmpeq_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3213
_mm_cmplt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:713
_mm_unpacklo_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4635
_mm_set1_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3846
_mm_cmple_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:738
_mm_cvttpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1477
_mm_mulhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2456
_mm_comile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1048
_mm_loadu_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1645
_mm_sra_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2979
_mm_cvtpi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1549
_mm_clflush
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
_mm_cmplt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3298
_mm_move_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4759
_mm_mulhi_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2476
_mm_cmpnle_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:623
_mm_avg_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2330
_mm_load_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3536
_mm_mullo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2496
_mm_cvtsi128_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3500
_mm_cmpeq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:688
_mm_move_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1936
_mm_sub_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2610
_mm_srl_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3080
_mm_cmpnlt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:895
_mm_unpackhi_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4579
_mm_sub_si64
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2629
_mm_sub_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2592
_mm_adds_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2290
_mm_slli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2920
_mm_sll_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2901
_mm_set_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3703
__attribute__
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
_mm_castsi128_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4934
_mm_storeh_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2076
_mm_srli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3099
_mm_unpacklo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4663
_mm_packs_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4307
_mm_cvtepi32_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3416
_mm_comieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:996
_mm_storeu_si64
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:4036
_mm_slli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2844
_mm_ucomieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1150
_mm_store_pd1
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:2017
_mm_setzero_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3981
_mm_castps_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4917
_mm_adds_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2269
_mm_div_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:205
_mm_or_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:393
_mm_andnot_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2767
_mm_unpacklo_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4707
_mm_cmpunord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:560
_mm_cvtpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1362
_mm_subs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2689
_mm_storeu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:4015
_mm_mul_su32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2515
_mm_stream_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4150
_mm_set1_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3827
_mm_loadl_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1782
_mm_slli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2882
_mm_cmpge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:513
_mm_sll_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2863
__p
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:24
_mm_cvttpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1532
_mm_min_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:292
_mm_packs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers,...
Definition: emmintrin.h:4279
_mm_sad_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2556
_mm_comigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1074
_mm_comilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1022
_mm_storer_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:2058
_mm_setr_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3920
_mm_cvtss_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1453
_mm_srai_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2959
_mm_cmplt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:450
_mm_castps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4900
_mm_movemask_pd
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4820
_mm_min_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:272
_mm_sub_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:121
_mm_cmpneq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:581
_mm_movpi64_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4741
_mm_cmpord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:536
_mm_movemask_epi8
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4403
_mm_set1_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3808
_mm_set1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1841
_mm_store1_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1996
_mm_add_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:79
_mm_avg_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2310
_mm_or_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2784
_mm_adds_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2226
__b
static __inline__ vector float vector float __b
Definition: altivec.h:566
_mm_mfence
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
_mm_loadu_si64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1665
_mm_cmpnlt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:602
_mm_cmplt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3340
_mm_store_sd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1954
_mm_lfence
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
_mm_add_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2121
_mm_loadr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1627
_mm_cmpgt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3256
_mm_loadu_si32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1686
_mm_sub_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2574
_mm_cmpgt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3233
_mm_unpackhi_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4529
_mm_max_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:316
_mm_cvtpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1299
_mm_ucomilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1176
_mm_store_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3999
_mm_cvtsd_f64
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1566
__c
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4751
_mm_castpd_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4883
_mm_stream_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4169
_mm_min_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2436
_mm_cmpnge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:665
_mm_subs_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
Definition: emmintrin.h:2709
_mm_and_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:354
_mm_mul_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:143
_mm_adds_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2248
_mm_cmplt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3319
_mm_storel_epi64
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:4127
_mm_set_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3635
_mm_ucomige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1254
_mm_cvttsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
Definition: emmintrin.h:1495
_mm_cvtsi32_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3465
_mm_cmpngt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:644
_mm_set_pd1
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1859
_mm_castsi128_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4951
_mm_setzero_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1915
_mm_add_si64
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2183
_mm_srli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3061
_mm_srl_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3118
_mm_max_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2376
_mm_setr_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3889
_mm_cmpngt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:945
_mm_cmpord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:817
_mm_load_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1583