clang  7.0.0svn
emmintrin.h
Go to the documentation of this file.
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to deal
5  * in the Software without restriction, including without limitation the rights
6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7  * copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19  * THE SOFTWARE.
20  *
21  *===-----------------------------------------------------------------------===
22  */
23 
24 #ifndef __EMMINTRIN_H
25 #define __EMMINTRIN_H
26 
27 #include <xmmintrin.h>
28 
/* Public 128-bit vector types of the SSE2 interface. */
typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Internal lane-typed views of a 128-bit vector. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Internal lane-typed views with unsigned lanes. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* An explicitly signed 8-bit lane view is needed in addition to the plain
 * char one. It is an implementation detail and should not appear in the
 * interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

/* Attribute sets applied to the functions defined in this file: always
 * inlined, no debug info, and compiled for the named target features. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2")))
50 
51 /// Adds lower double-precision values in both operands and returns the
52 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
53 /// are copied from the upper double-precision value of the first operand.
54 ///
55 /// \headerfile <x86intrin.h>
56 ///
57 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
58 ///
59 /// \param __a
60 /// A 128-bit vector of [2 x double] containing one of the source operands.
61 /// \param __b
62 /// A 128-bit vector of [2 x double] containing one of the source operands.
63 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
64 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
65 /// from the upper 64 bits of the first source operand.
66 static __inline__ __m128d __DEFAULT_FN_ATTRS
67 _mm_add_sd(__m128d __a, __m128d __b)
68 {
69  __a[0] += __b[0];
70  return __a;
71 }
72 
73 /// Adds two 128-bit vectors of [2 x double].
74 ///
75 /// \headerfile <x86intrin.h>
76 ///
77 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
78 ///
79 /// \param __a
80 /// A 128-bit vector of [2 x double] containing one of the source operands.
81 /// \param __b
82 /// A 128-bit vector of [2 x double] containing one of the source operands.
83 /// \returns A 128-bit vector of [2 x double] containing the sums of both
84 /// operands.
85 static __inline__ __m128d __DEFAULT_FN_ATTRS
86 _mm_add_pd(__m128d __a, __m128d __b)
87 {
88  return (__m128d)((__v2df)__a + (__v2df)__b);
89 }
90 
91 /// Subtracts the lower double-precision value of the second operand
92 /// from the lower double-precision value of the first operand and returns
93 /// the difference in the lower 64 bits of the result. The upper 64 bits of
94 /// the result are copied from the upper double-precision value of the first
95 /// operand.
96 ///
97 /// \headerfile <x86intrin.h>
98 ///
99 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
100 ///
101 /// \param __a
102 /// A 128-bit vector of [2 x double] containing the minuend.
103 /// \param __b
104 /// A 128-bit vector of [2 x double] containing the subtrahend.
105 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
106 /// difference of the lower 64 bits of both operands. The upper 64 bits are
107 /// copied from the upper 64 bits of the first source operand.
108 static __inline__ __m128d __DEFAULT_FN_ATTRS
109 _mm_sub_sd(__m128d __a, __m128d __b)
110 {
111  __a[0] -= __b[0];
112  return __a;
113 }
114 
115 /// Subtracts two 128-bit vectors of [2 x double].
116 ///
117 /// \headerfile <x86intrin.h>
118 ///
119 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
120 ///
121 /// \param __a
122 /// A 128-bit vector of [2 x double] containing the minuend.
123 /// \param __b
124 /// A 128-bit vector of [2 x double] containing the subtrahend.
125 /// \returns A 128-bit vector of [2 x double] containing the differences between
126 /// both operands.
127 static __inline__ __m128d __DEFAULT_FN_ATTRS
128 _mm_sub_pd(__m128d __a, __m128d __b)
129 {
130  return (__m128d)((__v2df)__a - (__v2df)__b);
131 }
132 
133 /// Multiplies lower double-precision values in both operands and returns
134 /// the product in the lower 64 bits of the result. The upper 64 bits of the
135 /// result are copied from the upper double-precision value of the first
136 /// operand.
137 ///
138 /// \headerfile <x86intrin.h>
139 ///
140 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
141 ///
142 /// \param __a
143 /// A 128-bit vector of [2 x double] containing one of the source operands.
144 /// \param __b
145 /// A 128-bit vector of [2 x double] containing one of the source operands.
146 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
147 /// product of the lower 64 bits of both operands. The upper 64 bits are
148 /// copied from the upper 64 bits of the first source operand.
149 static __inline__ __m128d __DEFAULT_FN_ATTRS
150 _mm_mul_sd(__m128d __a, __m128d __b)
151 {
152  __a[0] *= __b[0];
153  return __a;
154 }
155 
156 /// Multiplies two 128-bit vectors of [2 x double].
157 ///
158 /// \headerfile <x86intrin.h>
159 ///
160 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
161 ///
162 /// \param __a
163 /// A 128-bit vector of [2 x double] containing one of the operands.
164 /// \param __b
165 /// A 128-bit vector of [2 x double] containing one of the operands.
166 /// \returns A 128-bit vector of [2 x double] containing the products of both
167 /// operands.
168 static __inline__ __m128d __DEFAULT_FN_ATTRS
169 _mm_mul_pd(__m128d __a, __m128d __b)
170 {
171  return (__m128d)((__v2df)__a * (__v2df)__b);
172 }
173 
174 /// Divides the lower double-precision value of the first operand by the
175 /// lower double-precision value of the second operand and returns the
176 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
177 /// result are copied from the upper double-precision value of the first
178 /// operand.
179 ///
180 /// \headerfile <x86intrin.h>
181 ///
182 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
183 ///
184 /// \param __a
185 /// A 128-bit vector of [2 x double] containing the dividend.
186 /// \param __b
/// A 128-bit vector of [2 x double] containing the divisor.
188 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
189 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
190 /// copied from the upper 64 bits of the first source operand.
191 static __inline__ __m128d __DEFAULT_FN_ATTRS
192 _mm_div_sd(__m128d __a, __m128d __b)
193 {
194  __a[0] /= __b[0];
195  return __a;
196 }
197 
198 /// Performs an element-by-element division of two 128-bit vectors of
199 /// [2 x double].
200 ///
201 /// \headerfile <x86intrin.h>
202 ///
203 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
204 ///
205 /// \param __a
206 /// A 128-bit vector of [2 x double] containing the dividend.
207 /// \param __b
208 /// A 128-bit vector of [2 x double] containing the divisor.
209 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
210 /// operands.
211 static __inline__ __m128d __DEFAULT_FN_ATTRS
212 _mm_div_pd(__m128d __a, __m128d __b)
213 {
214  return (__m128d)((__v2df)__a / (__v2df)__b);
215 }
216 
217 /// Calculates the square root of the lower double-precision value of
218 /// the second operand and returns it in the lower 64 bits of the result.
219 /// The upper 64 bits of the result are copied from the upper
220 /// double-precision value of the first operand.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225 ///
226 /// \param __a
227 /// A 128-bit vector of [2 x double] containing one of the operands. The
228 /// upper 64 bits of this operand are copied to the upper 64 bits of the
229 /// result.
230 /// \param __b
231 /// A 128-bit vector of [2 x double] containing one of the operands. The
232 /// square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
235 /// bits are copied from the upper 64 bits of operand \a __a.
236 static __inline__ __m128d __DEFAULT_FN_ATTRS
237 _mm_sqrt_sd(__m128d __a, __m128d __b)
238 {
239  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
240  return __extension__ (__m128d) { __c[0], __a[1] };
241 }
242 
/// Calculates the square root of each of the two values stored in a
/// 128-bit vector of [2 x double].
245 ///
246 /// \headerfile <x86intrin.h>
247 ///
248 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
249 ///
250 /// \param __a
251 /// A 128-bit vector of [2 x double].
252 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
253 /// values in the operand.
254 static __inline__ __m128d __DEFAULT_FN_ATTRS
255 _mm_sqrt_pd(__m128d __a)
256 {
257  return __builtin_ia32_sqrtpd((__v2df)__a);
258 }
259 
260 /// Compares lower 64-bit double-precision values of both operands, and
261 /// returns the lesser of the pair of values in the lower 64-bits of the
262 /// result. The upper 64 bits of the result are copied from the upper
263 /// double-precision value of the first operand.
264 ///
265 /// \headerfile <x86intrin.h>
266 ///
267 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
268 ///
269 /// \param __a
270 /// A 128-bit vector of [2 x double] containing one of the operands. The
271 /// lower 64 bits of this operand are used in the comparison.
272 /// \param __b
273 /// A 128-bit vector of [2 x double] containing one of the operands. The
274 /// lower 64 bits of this operand are used in the comparison.
275 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
276 /// minimum value between both operands. The upper 64 bits are copied from
277 /// the upper 64 bits of the first source operand.
278 static __inline__ __m128d __DEFAULT_FN_ATTRS
279 _mm_min_sd(__m128d __a, __m128d __b)
280 {
281  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
282 }
283 
284 /// Performs element-by-element comparison of the two 128-bit vectors of
285 /// [2 x double] and returns the vector containing the lesser of each pair of
286 /// values.
287 ///
288 /// \headerfile <x86intrin.h>
289 ///
290 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
291 ///
292 /// \param __a
293 /// A 128-bit vector of [2 x double] containing one of the operands.
294 /// \param __b
295 /// A 128-bit vector of [2 x double] containing one of the operands.
296 /// \returns A 128-bit vector of [2 x double] containing the minimum values
297 /// between both operands.
298 static __inline__ __m128d __DEFAULT_FN_ATTRS
299 _mm_min_pd(__m128d __a, __m128d __b)
300 {
301  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
302 }
303 
304 /// Compares lower 64-bit double-precision values of both operands, and
305 /// returns the greater of the pair of values in the lower 64-bits of the
306 /// result. The upper 64 bits of the result are copied from the upper
307 /// double-precision value of the first operand.
308 ///
309 /// \headerfile <x86intrin.h>
310 ///
311 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
312 ///
313 /// \param __a
314 /// A 128-bit vector of [2 x double] containing one of the operands. The
315 /// lower 64 bits of this operand are used in the comparison.
316 /// \param __b
317 /// A 128-bit vector of [2 x double] containing one of the operands. The
318 /// lower 64 bits of this operand are used in the comparison.
319 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
320 /// maximum value between both operands. The upper 64 bits are copied from
321 /// the upper 64 bits of the first source operand.
322 static __inline__ __m128d __DEFAULT_FN_ATTRS
323 _mm_max_sd(__m128d __a, __m128d __b)
324 {
325  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
326 }
327 
328 /// Performs element-by-element comparison of the two 128-bit vectors of
329 /// [2 x double] and returns the vector containing the greater of each pair
330 /// of values.
331 ///
332 /// \headerfile <x86intrin.h>
333 ///
334 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
335 ///
336 /// \param __a
337 /// A 128-bit vector of [2 x double] containing one of the operands.
338 /// \param __b
339 /// A 128-bit vector of [2 x double] containing one of the operands.
340 /// \returns A 128-bit vector of [2 x double] containing the maximum values
341 /// between both operands.
342 static __inline__ __m128d __DEFAULT_FN_ATTRS
343 _mm_max_pd(__m128d __a, __m128d __b)
344 {
345  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
346 }
347 
348 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
349 ///
350 /// \headerfile <x86intrin.h>
351 ///
352 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
353 ///
354 /// \param __a
355 /// A 128-bit vector of [2 x double] containing one of the source operands.
356 /// \param __b
357 /// A 128-bit vector of [2 x double] containing one of the source operands.
358 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
359 /// values between both operands.
360 static __inline__ __m128d __DEFAULT_FN_ATTRS
361 _mm_and_pd(__m128d __a, __m128d __b)
362 {
363  return (__m128d)((__v2du)__a & (__v2du)__b);
364 }
365 
366 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
367 /// the one's complement of the values contained in the first source operand.
368 ///
369 /// \headerfile <x86intrin.h>
370 ///
371 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
372 ///
373 /// \param __a
374 /// A 128-bit vector of [2 x double] containing the left source operand. The
375 /// one's complement of this value is used in the bitwise AND.
376 /// \param __b
377 /// A 128-bit vector of [2 x double] containing the right source operand.
378 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
379 /// values in the second operand and the one's complement of the first
380 /// operand.
381 static __inline__ __m128d __DEFAULT_FN_ATTRS
382 _mm_andnot_pd(__m128d __a, __m128d __b)
383 {
384  return (__m128d)(~(__v2du)__a & (__v2du)__b);
385 }
386 
387 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
388 ///
389 /// \headerfile <x86intrin.h>
390 ///
391 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
392 ///
393 /// \param __a
394 /// A 128-bit vector of [2 x double] containing one of the source operands.
395 /// \param __b
396 /// A 128-bit vector of [2 x double] containing one of the source operands.
397 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
398 /// values between both operands.
399 static __inline__ __m128d __DEFAULT_FN_ATTRS
400 _mm_or_pd(__m128d __a, __m128d __b)
401 {
402  return (__m128d)((__v2du)__a | (__v2du)__b);
403 }
404 
405 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
406 ///
407 /// \headerfile <x86intrin.h>
408 ///
409 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
410 ///
411 /// \param __a
412 /// A 128-bit vector of [2 x double] containing one of the source operands.
413 /// \param __b
414 /// A 128-bit vector of [2 x double] containing one of the source operands.
415 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
416 /// values between both operands.
417 static __inline__ __m128d __DEFAULT_FN_ATTRS
418 _mm_xor_pd(__m128d __a, __m128d __b)
419 {
420  return (__m128d)((__v2du)__a ^ (__v2du)__b);
421 }
422 
423 /// Compares each of the corresponding double-precision values of the
424 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
425 /// for false, 0xFFFFFFFFFFFFFFFF for true.
426 ///
427 /// \headerfile <x86intrin.h>
428 ///
429 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
430 ///
431 /// \param __a
432 /// A 128-bit vector of [2 x double].
433 /// \param __b
434 /// A 128-bit vector of [2 x double].
435 /// \returns A 128-bit vector containing the comparison results.
436 static __inline__ __m128d __DEFAULT_FN_ATTRS
437 _mm_cmpeq_pd(__m128d __a, __m128d __b)
438 {
439  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
440 }
441 
442 /// Compares each of the corresponding double-precision values of the
443 /// 128-bit vectors of [2 x double] to determine if the values in the first
444 /// operand are less than those in the second operand. Each comparison
445 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
446 ///
447 /// \headerfile <x86intrin.h>
448 ///
449 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
450 ///
451 /// \param __a
452 /// A 128-bit vector of [2 x double].
453 /// \param __b
454 /// A 128-bit vector of [2 x double].
455 /// \returns A 128-bit vector containing the comparison results.
456 static __inline__ __m128d __DEFAULT_FN_ATTRS
457 _mm_cmplt_pd(__m128d __a, __m128d __b)
458 {
459  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
460 }
461 
462 /// Compares each of the corresponding double-precision values of the
463 /// 128-bit vectors of [2 x double] to determine if the values in the first
464 /// operand are less than or equal to those in the second operand.
465 ///
466 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
467 ///
468 /// \headerfile <x86intrin.h>
469 ///
470 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
471 ///
472 /// \param __a
473 /// A 128-bit vector of [2 x double].
474 /// \param __b
475 /// A 128-bit vector of [2 x double].
476 /// \returns A 128-bit vector containing the comparison results.
477 static __inline__ __m128d __DEFAULT_FN_ATTRS
478 _mm_cmple_pd(__m128d __a, __m128d __b)
479 {
480  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
481 }
482 
483 /// Compares each of the corresponding double-precision values of the
484 /// 128-bit vectors of [2 x double] to determine if the values in the first
485 /// operand are greater than those in the second operand.
486 ///
487 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
488 ///
489 /// \headerfile <x86intrin.h>
490 ///
491 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
492 ///
493 /// \param __a
494 /// A 128-bit vector of [2 x double].
495 /// \param __b
496 /// A 128-bit vector of [2 x double].
497 /// \returns A 128-bit vector containing the comparison results.
498 static __inline__ __m128d __DEFAULT_FN_ATTRS
499 _mm_cmpgt_pd(__m128d __a, __m128d __b)
500 {
501  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
502 }
503 
504 /// Compares each of the corresponding double-precision values of the
505 /// 128-bit vectors of [2 x double] to determine if the values in the first
506 /// operand are greater than or equal to those in the second operand.
507 ///
508 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
509 ///
510 /// \headerfile <x86intrin.h>
511 ///
512 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
513 ///
514 /// \param __a
515 /// A 128-bit vector of [2 x double].
516 /// \param __b
517 /// A 128-bit vector of [2 x double].
518 /// \returns A 128-bit vector containing the comparison results.
519 static __inline__ __m128d __DEFAULT_FN_ATTRS
520 _mm_cmpge_pd(__m128d __a, __m128d __b)
521 {
522  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
523 }
524 
525 /// Compares each of the corresponding double-precision values of the
526 /// 128-bit vectors of [2 x double] to determine if the values in the first
527 /// operand are ordered with respect to those in the second operand.
528 ///
529 /// A pair of double-precision values are "ordered" with respect to each
530 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
531 /// 0xFFFFFFFFFFFFFFFF for true.
532 ///
533 /// \headerfile <x86intrin.h>
534 ///
535 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
536 ///
537 /// \param __a
538 /// A 128-bit vector of [2 x double].
539 /// \param __b
540 /// A 128-bit vector of [2 x double].
541 /// \returns A 128-bit vector containing the comparison results.
542 static __inline__ __m128d __DEFAULT_FN_ATTRS
543 _mm_cmpord_pd(__m128d __a, __m128d __b)
544 {
545  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
546 }
547 
548 /// Compares each of the corresponding double-precision values of the
549 /// 128-bit vectors of [2 x double] to determine if the values in the first
550 /// operand are unordered with respect to those in the second operand.
551 ///
552 /// A pair of double-precision values are "unordered" with respect to each
553 /// other if one or both values are NaN. Each comparison yields 0x0 for
554 /// false, 0xFFFFFFFFFFFFFFFF for true.
555 ///
556 /// \headerfile <x86intrin.h>
557 ///
558 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
559 /// instruction.
560 ///
561 /// \param __a
562 /// A 128-bit vector of [2 x double].
563 /// \param __b
564 /// A 128-bit vector of [2 x double].
565 /// \returns A 128-bit vector containing the comparison results.
566 static __inline__ __m128d __DEFAULT_FN_ATTRS
567 _mm_cmpunord_pd(__m128d __a, __m128d __b)
568 {
569  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
570 }
571 
572 /// Compares each of the corresponding double-precision values of the
573 /// 128-bit vectors of [2 x double] to determine if the values in the first
574 /// operand are unequal to those in the second operand.
575 ///
576 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
577 ///
578 /// \headerfile <x86intrin.h>
579 ///
580 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
581 ///
582 /// \param __a
583 /// A 128-bit vector of [2 x double].
584 /// \param __b
585 /// A 128-bit vector of [2 x double].
586 /// \returns A 128-bit vector containing the comparison results.
587 static __inline__ __m128d __DEFAULT_FN_ATTRS
588 _mm_cmpneq_pd(__m128d __a, __m128d __b)
589 {
590  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
591 }
592 
593 /// Compares each of the corresponding double-precision values of the
594 /// 128-bit vectors of [2 x double] to determine if the values in the first
595 /// operand are not less than those in the second operand.
596 ///
597 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
598 ///
599 /// \headerfile <x86intrin.h>
600 ///
601 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
602 ///
603 /// \param __a
604 /// A 128-bit vector of [2 x double].
605 /// \param __b
606 /// A 128-bit vector of [2 x double].
607 /// \returns A 128-bit vector containing the comparison results.
608 static __inline__ __m128d __DEFAULT_FN_ATTRS
609 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
610 {
611  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
612 }
613 
614 /// Compares each of the corresponding double-precision values of the
615 /// 128-bit vectors of [2 x double] to determine if the values in the first
616 /// operand are not less than or equal to those in the second operand.
617 ///
618 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
619 ///
620 /// \headerfile <x86intrin.h>
621 ///
622 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
623 ///
624 /// \param __a
625 /// A 128-bit vector of [2 x double].
626 /// \param __b
627 /// A 128-bit vector of [2 x double].
628 /// \returns A 128-bit vector containing the comparison results.
629 static __inline__ __m128d __DEFAULT_FN_ATTRS
630 _mm_cmpnle_pd(__m128d __a, __m128d __b)
631 {
632  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
633 }
634 
635 /// Compares each of the corresponding double-precision values of the
636 /// 128-bit vectors of [2 x double] to determine if the values in the first
637 /// operand are not greater than those in the second operand.
638 ///
639 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
640 ///
641 /// \headerfile <x86intrin.h>
642 ///
643 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
644 ///
645 /// \param __a
646 /// A 128-bit vector of [2 x double].
647 /// \param __b
648 /// A 128-bit vector of [2 x double].
649 /// \returns A 128-bit vector containing the comparison results.
650 static __inline__ __m128d __DEFAULT_FN_ATTRS
651 _mm_cmpngt_pd(__m128d __a, __m128d __b)
652 {
653  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
654 }
655 
656 /// Compares each of the corresponding double-precision values of the
657 /// 128-bit vectors of [2 x double] to determine if the values in the first
658 /// operand are not greater than or equal to those in the second operand.
659 ///
660 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
661 ///
662 /// \headerfile <x86intrin.h>
663 ///
664 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
665 ///
666 /// \param __a
667 /// A 128-bit vector of [2 x double].
668 /// \param __b
669 /// A 128-bit vector of [2 x double].
670 /// \returns A 128-bit vector containing the comparison results.
671 static __inline__ __m128d __DEFAULT_FN_ATTRS
672 _mm_cmpnge_pd(__m128d __a, __m128d __b)
673 {
674  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
675 }
676 
677 /// Compares the lower double-precision floating-point values in each of
678 /// the two 128-bit floating-point vectors of [2 x double] for equality.
679 ///
680 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
681 ///
682 /// \headerfile <x86intrin.h>
683 ///
684 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
685 ///
686 /// \param __a
687 /// A 128-bit vector of [2 x double]. The lower double-precision value is
688 /// compared to the lower double-precision value of \a __b.
689 /// \param __b
690 /// A 128-bit vector of [2 x double]. The lower double-precision value is
691 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
694 static __inline__ __m128d __DEFAULT_FN_ATTRS
695 _mm_cmpeq_sd(__m128d __a, __m128d __b)
696 {
697  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
698 }
699 
700 /// Compares the lower double-precision floating-point values in each of
701 /// the two 128-bit floating-point vectors of [2 x double] to determine if
702 /// the value in the first parameter is less than the corresponding value in
703 /// the second parameter.
704 ///
705 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
706 ///
707 /// \headerfile <x86intrin.h>
708 ///
709 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
710 ///
711 /// \param __a
712 /// A 128-bit vector of [2 x double]. The lower double-precision value is
713 /// compared to the lower double-precision value of \a __b.
714 /// \param __b
715 /// A 128-bit vector of [2 x double]. The lower double-precision value is
716 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
719 static __inline__ __m128d __DEFAULT_FN_ATTRS
720 _mm_cmplt_sd(__m128d __a, __m128d __b)
721 {
722  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
723 }
724 
725 /// Compares the lower double-precision floating-point values in each of
726 /// the two 128-bit floating-point vectors of [2 x double] to determine if
727 /// the value in the first parameter is less than or equal to the
728 /// corresponding value in the second parameter.
729 ///
730 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
731 ///
732 /// \headerfile <x86intrin.h>
733 ///
734 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
735 ///
736 /// \param __a
737 /// A 128-bit vector of [2 x double]. The lower double-precision value is
738 /// compared to the lower double-precision value of \a __b.
739 /// \param __b
740 /// A 128-bit vector of [2 x double]. The lower double-precision value is
741 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
744 static __inline__ __m128d __DEFAULT_FN_ATTRS
745 _mm_cmple_sd(__m128d __a, __m128d __b)
746 {
747  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
748 }
749 
750 /// Compares the lower double-precision floating-point values in each of
751 /// the two 128-bit floating-point vectors of [2 x double] to determine if
752 /// the value in the first parameter is greater than the corresponding value
753 /// in the second parameter.
754 ///
755 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
756 ///
757 /// \headerfile <x86intrin.h>
758 ///
759 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
760 ///
761 /// \param __a
762 /// A 128-bit vector of [2 x double]. The lower double-precision value is
763 /// compared to the lower double-precision value of \a __b.
764 /// \param __b
765 /// A 128-bit vector of [2 x double]. The lower double-precision value is
766 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
769 static __inline__ __m128d __DEFAULT_FN_ATTRS
770 _mm_cmpgt_sd(__m128d __a, __m128d __b)
771 {
772  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
773  return __extension__ (__m128d) { __c[0], __a[1] };
774 }
775 
776 /// Compares the lower double-precision floating-point values in each of
777 /// the two 128-bit floating-point vectors of [2 x double] to determine if
778 /// the value in the first parameter is greater than or equal to the
779 /// corresponding value in the second parameter.
780 ///
781 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
782 ///
783 /// \headerfile <x86intrin.h>
784 ///
785 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
786 ///
787 /// \param __a
788 /// A 128-bit vector of [2 x double]. The lower double-precision value is
789 /// compared to the lower double-precision value of \a __b.
790 /// \param __b
791 /// A 128-bit vector of [2 x double]. The lower double-precision value is
792 /// compared to the lower double-precision value of \a __a.
793 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
794 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795 static __inline__ __m128d __DEFAULT_FN_ATTRS
796 _mm_cmpge_sd(__m128d __a, __m128d __b)
797 {
798  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
799  return __extension__ (__m128d) { __c[0], __a[1] };
800 }
801 
802 /// Compares the lower double-precision floating-point values in each of
803 /// the two 128-bit floating-point vectors of [2 x double] to determine if
804 /// the value in the first parameter is "ordered" with respect to the
805 /// corresponding value in the second parameter.
806 ///
807 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
808 /// of double-precision values are "ordered" with respect to each other if
809 /// neither value is a NaN.
810 ///
811 /// \headerfile <x86intrin.h>
812 ///
813 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
814 ///
815 /// \param __a
816 /// A 128-bit vector of [2 x double]. The lower double-precision value is
817 /// compared to the lower double-precision value of \a __b.
818 /// \param __b
819 /// A 128-bit vector of [2 x double]. The lower double-precision value is
820 /// compared to the lower double-precision value of \a __a.
821 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
822 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
823 static __inline__ __m128d __DEFAULT_FN_ATTRS
824 _mm_cmpord_sd(__m128d __a, __m128d __b)
825 {
826  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
827 }
828 
829 /// Compares the lower double-precision floating-point values in each of
830 /// the two 128-bit floating-point vectors of [2 x double] to determine if
831 /// the value in the first parameter is "unordered" with respect to the
832 /// corresponding value in the second parameter.
833 ///
834 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
835 /// of double-precision values are "unordered" with respect to each other if
836 /// one or both values are NaN.
837 ///
838 /// \headerfile <x86intrin.h>
839 ///
840 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
841 /// instruction.
842 ///
843 /// \param __a
844 /// A 128-bit vector of [2 x double]. The lower double-precision value is
845 /// compared to the lower double-precision value of \a __b.
846 /// \param __b
847 /// A 128-bit vector of [2 x double]. The lower double-precision value is
848 /// compared to the lower double-precision value of \a __a.
849 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
850 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
851 static __inline__ __m128d __DEFAULT_FN_ATTRS
852 _mm_cmpunord_sd(__m128d __a, __m128d __b)
853 {
854  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
855 }
856 
857 /// Compares the lower double-precision floating-point values in each of
858 /// the two 128-bit floating-point vectors of [2 x double] to determine if
859 /// the value in the first parameter is unequal to the corresponding value in
860 /// the second parameter.
861 ///
862 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
863 ///
864 /// \headerfile <x86intrin.h>
865 ///
866 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
867 ///
868 /// \param __a
869 /// A 128-bit vector of [2 x double]. The lower double-precision value is
870 /// compared to the lower double-precision value of \a __b.
871 /// \param __b
872 /// A 128-bit vector of [2 x double]. The lower double-precision value is
873 /// compared to the lower double-precision value of \a __a.
874 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
875 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
876 static __inline__ __m128d __DEFAULT_FN_ATTRS
877 _mm_cmpneq_sd(__m128d __a, __m128d __b)
878 {
879  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
880 }
881 
882 /// Compares the lower double-precision floating-point values in each of
883 /// the two 128-bit floating-point vectors of [2 x double] to determine if
884 /// the value in the first parameter is not less than the corresponding
885 /// value in the second parameter.
886 ///
887 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
888 ///
889 /// \headerfile <x86intrin.h>
890 ///
891 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
892 ///
893 /// \param __a
894 /// A 128-bit vector of [2 x double]. The lower double-precision value is
895 /// compared to the lower double-precision value of \a __b.
896 /// \param __b
897 /// A 128-bit vector of [2 x double]. The lower double-precision value is
898 /// compared to the lower double-precision value of \a __a.
899 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
900 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
901 static __inline__ __m128d __DEFAULT_FN_ATTRS
902 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
903 {
904  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
905 }
906 
907 /// Compares the lower double-precision floating-point values in each of
908 /// the two 128-bit floating-point vectors of [2 x double] to determine if
909 /// the value in the first parameter is not less than or equal to the
910 /// corresponding value in the second parameter.
911 ///
912 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
913 ///
914 /// \headerfile <x86intrin.h>
915 ///
916 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
917 ///
918 /// \param __a
919 /// A 128-bit vector of [2 x double]. The lower double-precision value is
920 /// compared to the lower double-precision value of \a __b.
921 /// \param __b
922 /// A 128-bit vector of [2 x double]. The lower double-precision value is
923 /// compared to the lower double-precision value of \a __a.
924 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
925 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
926 static __inline__ __m128d __DEFAULT_FN_ATTRS
927 _mm_cmpnle_sd(__m128d __a, __m128d __b)
928 {
929  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
930 }
931 
932 /// Compares the lower double-precision floating-point values in each of
933 /// the two 128-bit floating-point vectors of [2 x double] to determine if
934 /// the value in the first parameter is not greater than the corresponding
935 /// value in the second parameter.
936 ///
937 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
938 ///
939 /// \headerfile <x86intrin.h>
940 ///
941 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
942 ///
943 /// \param __a
944 /// A 128-bit vector of [2 x double]. The lower double-precision value is
945 /// compared to the lower double-precision value of \a __b.
946 /// \param __b
947 /// A 128-bit vector of [2 x double]. The lower double-precision value is
948 /// compared to the lower double-precision value of \a __a.
949 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
950 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
951 static __inline__ __m128d __DEFAULT_FN_ATTRS
952 _mm_cmpngt_sd(__m128d __a, __m128d __b)
953 {
954  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
955  return __extension__ (__m128d) { __c[0], __a[1] };
956 }
957 
958 /// Compares the lower double-precision floating-point values in each of
959 /// the two 128-bit floating-point vectors of [2 x double] to determine if
960 /// the value in the first parameter is not greater than or equal to the
961 /// corresponding value in the second parameter.
962 ///
963 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
964 ///
965 /// \headerfile <x86intrin.h>
966 ///
967 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
968 ///
969 /// \param __a
970 /// A 128-bit vector of [2 x double]. The lower double-precision value is
971 /// compared to the lower double-precision value of \a __b.
972 /// \param __b
973 /// A 128-bit vector of [2 x double]. The lower double-precision value is
974 /// compared to the lower double-precision value of \a __a.
975 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
976 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
977 static __inline__ __m128d __DEFAULT_FN_ATTRS
978 _mm_cmpnge_sd(__m128d __a, __m128d __b)
979 {
980  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
981  return __extension__ (__m128d) { __c[0], __a[1] };
982 }
983 
984 /// Compares the lower double-precision floating-point values in each of
985 /// the two 128-bit floating-point vectors of [2 x double] for equality.
986 ///
987 /// The comparison yields 0 for false, 1 for true. If either of the two
988 /// lower double-precision values is NaN, 0 is returned.
989 ///
990 /// \headerfile <x86intrin.h>
991 ///
992 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
993 ///
994 /// \param __a
995 /// A 128-bit vector of [2 x double]. The lower double-precision value is
996 /// compared to the lower double-precision value of \a __b.
997 /// \param __b
998 /// A 128-bit vector of [2 x double]. The lower double-precision value is
999 /// compared to the lower double-precision value of \a __a.
1000 /// \returns An integer containing the comparison results. If either of the two
1001 /// lower double-precision values is NaN, 0 is returned.
1002 static __inline__ int __DEFAULT_FN_ATTRS
1003 _mm_comieq_sd(__m128d __a, __m128d __b)
1004 {
1005  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1006 }
1007 
1008 /// Compares the lower double-precision floating-point values in each of
1009 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1010 /// the value in the first parameter is less than the corresponding value in
1011 /// the second parameter.
1012 ///
1013 /// The comparison yields 0 for false, 1 for true. If either of the two
1014 /// lower double-precision values is NaN, 0 is returned.
1015 ///
1016 /// \headerfile <x86intrin.h>
1017 ///
1018 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1019 ///
1020 /// \param __a
1021 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1022 /// compared to the lower double-precision value of \a __b.
1023 /// \param __b
1024 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1025 /// compared to the lower double-precision value of \a __a.
1026 /// \returns An integer containing the comparison results. If either of the two
1027 /// lower double-precision values is NaN, 0 is returned.
1028 static __inline__ int __DEFAULT_FN_ATTRS
1029 _mm_comilt_sd(__m128d __a, __m128d __b)
1030 {
1031  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1032 }
1033 
1034 /// Compares the lower double-precision floating-point values in each of
1035 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1036 /// the value in the first parameter is less than or equal to the
1037 /// corresponding value in the second parameter.
1038 ///
1039 /// The comparison yields 0 for false, 1 for true. If either of the two
1040 /// lower double-precision values is NaN, 0 is returned.
1041 ///
1042 /// \headerfile <x86intrin.h>
1043 ///
1044 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1045 ///
1046 /// \param __a
1047 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1048 /// compared to the lower double-precision value of \a __b.
1049 /// \param __b
1050 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1051 /// compared to the lower double-precision value of \a __a.
1052 /// \returns An integer containing the comparison results. If either of the two
1053 /// lower double-precision values is NaN, 0 is returned.
1054 static __inline__ int __DEFAULT_FN_ATTRS
1055 _mm_comile_sd(__m128d __a, __m128d __b)
1056 {
1057  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1058 }
1059 
1060 /// Compares the lower double-precision floating-point values in each of
1061 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1062 /// the value in the first parameter is greater than the corresponding value
1063 /// in the second parameter.
1064 ///
1065 /// The comparison yields 0 for false, 1 for true. If either of the two
1066 /// lower double-precision values is NaN, 0 is returned.
1067 ///
1068 /// \headerfile <x86intrin.h>
1069 ///
1070 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1071 ///
1072 /// \param __a
1073 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1074 /// compared to the lower double-precision value of \a __b.
1075 /// \param __b
1076 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1077 /// compared to the lower double-precision value of \a __a.
1078 /// \returns An integer containing the comparison results. If either of the two
1079 /// lower double-precision values is NaN, 0 is returned.
1080 static __inline__ int __DEFAULT_FN_ATTRS
1081 _mm_comigt_sd(__m128d __a, __m128d __b)
1082 {
1083  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1084 }
1085 
1086 /// Compares the lower double-precision floating-point values in each of
1087 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1088 /// the value in the first parameter is greater than or equal to the
1089 /// corresponding value in the second parameter.
1090 ///
1091 /// The comparison yields 0 for false, 1 for true. If either of the two
1092 /// lower double-precision values is NaN, 0 is returned.
1093 ///
1094 /// \headerfile <x86intrin.h>
1095 ///
1096 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1097 ///
1098 /// \param __a
1099 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1100 /// compared to the lower double-precision value of \a __b.
1101 /// \param __b
1102 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1103 /// compared to the lower double-precision value of \a __a.
1104 /// \returns An integer containing the comparison results. If either of the two
1105 /// lower double-precision values is NaN, 0 is returned.
1106 static __inline__ int __DEFAULT_FN_ATTRS
1107 _mm_comige_sd(__m128d __a, __m128d __b)
1108 {
1109  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1110 }
1111 
1112 /// Compares the lower double-precision floating-point values in each of
1113 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1114 /// the value in the first parameter is unequal to the corresponding value in
1115 /// the second parameter.
1116 ///
1117 /// The comparison yields 0 for false, 1 for true. If either of the two
1118 /// lower double-precision values is NaN, 1 is returned.
1119 ///
1120 /// \headerfile <x86intrin.h>
1121 ///
1122 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1123 ///
1124 /// \param __a
1125 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1126 /// compared to the lower double-precision value of \a __b.
1127 /// \param __b
1128 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1129 /// compared to the lower double-precision value of \a __a.
1130 /// \returns An integer containing the comparison results. If either of the two
1131 /// lower double-precision values is NaN, 1 is returned.
1132 static __inline__ int __DEFAULT_FN_ATTRS
1133 _mm_comineq_sd(__m128d __a, __m128d __b)
1134 {
1135  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1136 }
1137 
1138 /// Compares the lower double-precision floating-point values in each of
1139 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1140 /// comparison yields 0 for false, 1 for true.
1141 ///
1142 /// If either of the two lower double-precision values is NaN, 0 is returned.
1143 ///
1144 /// \headerfile <x86intrin.h>
1145 ///
1146 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1147 ///
1148 /// \param __a
1149 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1150 /// compared to the lower double-precision value of \a __b.
1151 /// \param __b
1152 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1153 /// compared to the lower double-precision value of \a __a.
1154 /// \returns An integer containing the comparison results. If either of the two
1155 /// lower double-precision values is NaN, 0 is returned.
1156 static __inline__ int __DEFAULT_FN_ATTRS
1157 _mm_ucomieq_sd(__m128d __a, __m128d __b)
1158 {
1159  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1160 }
1161 
1162 /// Compares the lower double-precision floating-point values in each of
1163 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1164 /// the value in the first parameter is less than the corresponding value in
1165 /// the second parameter.
1166 ///
1167 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1168 /// double-precision values is NaN, 0 is returned.
1169 ///
1170 /// \headerfile <x86intrin.h>
1171 ///
1172 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1173 ///
1174 /// \param __a
1175 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1176 /// compared to the lower double-precision value of \a __b.
1177 /// \param __b
1178 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1179 /// compared to the lower double-precision value of \a __a.
1180 /// \returns An integer containing the comparison results. If either of the two
1181 /// lower double-precision values is NaN, 0 is returned.
1182 static __inline__ int __DEFAULT_FN_ATTRS
1183 _mm_ucomilt_sd(__m128d __a, __m128d __b)
1184 {
1185  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1186 }
1187 
1188 /// Compares the lower double-precision floating-point values in each of
1189 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1190 /// the value in the first parameter is less than or equal to the
1191 /// corresponding value in the second parameter.
1192 ///
1193 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1194 /// double-precision values is NaN, 0 is returned.
1195 ///
1196 /// \headerfile <x86intrin.h>
1197 ///
1198 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1199 ///
1200 /// \param __a
1201 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1202 /// compared to the lower double-precision value of \a __b.
1203 /// \param __b
1204 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1205 /// compared to the lower double-precision value of \a __a.
1206 /// \returns An integer containing the comparison results. If either of the two
1207 /// lower double-precision values is NaN, 0 is returned.
1208 static __inline__ int __DEFAULT_FN_ATTRS
1209 _mm_ucomile_sd(__m128d __a, __m128d __b)
1210 {
1211  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1212 }
1213 
1214 /// Compares the lower double-precision floating-point values in each of
1215 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1216 /// the value in the first parameter is greater than the corresponding value
1217 /// in the second parameter.
1218 ///
1219 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1220 /// double-precision values is NaN, 0 is returned.
1221 ///
1222 /// \headerfile <x86intrin.h>
1223 ///
1224 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1225 ///
1226 /// \param __a
1227 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1228 /// compared to the lower double-precision value of \a __b.
1229 /// \param __b
1230 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1231 /// compared to the lower double-precision value of \a __a.
1232 /// \returns An integer containing the comparison results. If either of the two
1233 /// lower double-precision values is NaN, 0 is returned.
1234 static __inline__ int __DEFAULT_FN_ATTRS
1235 _mm_ucomigt_sd(__m128d __a, __m128d __b)
1236 {
1237  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1238 }
1239 
1240 /// Compares the lower double-precision floating-point values in each of
1241 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1242 /// the value in the first parameter is greater than or equal to the
1243 /// corresponding value in the second parameter.
1244 ///
1245 /// The comparison yields 0 for false, 1 for true. If either of the two
1246 /// lower double-precision values is NaN, 0 is returned.
1247 ///
1248 /// \headerfile <x86intrin.h>
1249 ///
1250 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1251 ///
1252 /// \param __a
1253 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1254 /// compared to the lower double-precision value of \a __b.
1255 /// \param __b
1256 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1257 /// compared to the lower double-precision value of \a __a.
1258 /// \returns An integer containing the comparison results. If either of the two
1259 /// lower double-precision values is NaN, 0 is returned.
1260 static __inline__ int __DEFAULT_FN_ATTRS
1261 _mm_ucomige_sd(__m128d __a, __m128d __b)
1262 {
1263  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1264 }
1265 
1266 /// Compares the lower double-precision floating-point values in each of
1267 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1268 /// the value in the first parameter is unequal to the corresponding value in
1269 /// the second parameter.
1270 ///
1271 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1272 /// double-precision values is NaN, 1 is returned.
1273 ///
1274 /// \headerfile <x86intrin.h>
1275 ///
1276 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1277 ///
1278 /// \param __a
1279 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1280 /// compared to the lower double-precision value of \a __b.
1281 /// \param __b
1282 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1283 /// compared to the lower double-precision value of \a __a.
1284 /// \returns An integer containing the comparison result. If either of the two
1285 /// lower double-precision values is NaN, 1 is returned.
1286 static __inline__ int __DEFAULT_FN_ATTRS
1287 _mm_ucomineq_sd(__m128d __a, __m128d __b)
1288 {
1289  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1290 }
1291 
1292 /// Converts the two double-precision floating-point elements of a
1293 /// 128-bit vector of [2 x double] into two single-precision floating-point
1294 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1295 /// The upper 64 bits of the result vector are set to zero.
1296 ///
1297 /// \headerfile <x86intrin.h>
1298 ///
1299 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1300 ///
1301 /// \param __a
1302 /// A 128-bit vector of [2 x double].
1303 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1304 /// converted values. The upper 64 bits are set to zero.
1305 static __inline__ __m128 __DEFAULT_FN_ATTRS
1306 _mm_cvtpd_ps(__m128d __a)
1307 {
1308  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1309 }
1310 
1311 /// Converts the lower two single-precision floating-point elements of a
1312 /// 128-bit vector of [4 x float] into two double-precision floating-point
1313 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1314 /// elements of the input vector are unused.
1315 ///
1316 /// \headerfile <x86intrin.h>
1317 ///
1318 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1319 ///
1320 /// \param __a
1321 /// A 128-bit vector of [4 x float]. The lower two single-precision
1322 /// floating-point elements are converted to double-precision values. The
1323 /// upper two elements are unused.
1324 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1325 static __inline__ __m128d __DEFAULT_FN_ATTRS
1326 _mm_cvtps_pd(__m128 __a)
1327 {
1328  return (__m128d) __builtin_convertvector(
1329  __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1330 }
1331 
1332 /// Converts the lower two integer elements of a 128-bit vector of
1333 /// [4 x i32] into two double-precision floating-point values, returned in a
1334 /// 128-bit vector of [2 x double].
1335 ///
1336 /// The upper two elements of the input vector are unused.
1337 ///
1338 /// \headerfile <x86intrin.h>
1339 ///
1340 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1341 ///
1342 /// \param __a
1343 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1344 /// converted to double-precision values.
1345 ///
1346 /// The upper two elements are unused.
1347 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1348 static __inline__ __m128d __DEFAULT_FN_ATTRS
1349 _mm_cvtepi32_pd(__m128i __a)
1350 {
1351  return (__m128d) __builtin_convertvector(
1352  __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1353 }
1354 
1355 /// Converts the two double-precision floating-point elements of a
1356 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1357 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1358 /// 64 bits of the result vector are set to zero.
1359 ///
1360 /// \headerfile <x86intrin.h>
1361 ///
1362 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1363 ///
1364 /// \param __a
1365 /// A 128-bit vector of [2 x double].
1366 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1367 /// converted values. The upper 64 bits are set to zero.
1368 static __inline__ __m128i __DEFAULT_FN_ATTRS
1369 _mm_cvtpd_epi32(__m128d __a)
1370 {
1371  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1372 }
1373 
1374 /// Converts the low-order element of a 128-bit vector of [2 x double]
1375 /// into a 32-bit signed integer value.
1376 ///
1377 /// \headerfile <x86intrin.h>
1378 ///
1379 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1380 ///
1381 /// \param __a
1382 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1383 /// conversion.
1384 /// \returns A 32-bit signed integer containing the converted value.
1385 static __inline__ int __DEFAULT_FN_ATTRS
1386 _mm_cvtsd_si32(__m128d __a)
1387 {
1388  return __builtin_ia32_cvtsd2si((__v2df)__a);
1389 }
1390 
1391 /// Converts the lower double-precision floating-point element of a
1392 /// 128-bit vector of [2 x double], in the second parameter, into a
1393 /// single-precision floating-point value, returned in the lower 32 bits of a
1394 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1395 /// copied from the upper 96 bits of the first parameter.
1396 ///
1397 /// \headerfile <x86intrin.h>
1398 ///
1399 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1400 ///
1401 /// \param __a
1402 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1403 /// copied to the upper 96 bits of the result.
1404 /// \param __b
1405 /// A 128-bit vector of [2 x double]. The lower double-precision
1406 /// floating-point element is used in the conversion.
1407 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1408 /// converted value from the second parameter. The upper 96 bits are copied
1409 /// from the upper 96 bits of the first parameter.
1410 static __inline__ __m128 __DEFAULT_FN_ATTRS
1411 _mm_cvtsd_ss(__m128 __a, __m128d __b)
1412 {
1413  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1414 }
1415 
1416 /// Converts a 32-bit signed integer value, in the second parameter, into
1417 /// a double-precision floating-point value, returned in the lower 64 bits of
1418 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1419 /// are copied from the upper 64 bits of the first parameter.
1420 ///
1421 /// \headerfile <x86intrin.h>
1422 ///
1423 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1424 ///
1425 /// \param __a
1426 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1427 /// copied to the upper 64 bits of the result.
1428 /// \param __b
1429 /// A 32-bit signed integer containing the value to be converted.
1430 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1431 /// converted value from the second parameter. The upper 64 bits are copied
1432 /// from the upper 64 bits of the first parameter.
1433 static __inline__ __m128d __DEFAULT_FN_ATTRS
1434 _mm_cvtsi32_sd(__m128d __a, int __b)
1435 {
1436  __a[0] = __b;
1437  return __a;
1438 }
1439 
1440 /// Converts the lower single-precision floating-point element of a
1441 /// 128-bit vector of [4 x float], in the second parameter, into a
1442 /// double-precision floating-point value, returned in the lower 64 bits of
1443 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1444 /// are copied from the upper 64 bits of the first parameter.
1445 ///
1446 /// \headerfile <x86intrin.h>
1447 ///
1448 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1449 ///
1450 /// \param __a
1451 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1452 /// copied to the upper 64 bits of the result.
1453 /// \param __b
1454 /// A 128-bit vector of [4 x float]. The lower single-precision
1455 /// floating-point element is used in the conversion.
1456 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1457 /// converted value from the second parameter. The upper 64 bits are copied
1458 /// from the upper 64 bits of the first parameter.
1459 static __inline__ __m128d __DEFAULT_FN_ATTRS
1460 _mm_cvtss_sd(__m128d __a, __m128 __b)
1461 {
1462  __a[0] = __b[0];
1463  return __a;
1464 }
1465 
1466 /// Converts the two double-precision floating-point elements of a
1467 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1468 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1469 ///
1470 /// If the result of either conversion is inexact, the result is truncated
1471 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1472 /// 64 bits of the result vector are set to zero.
1473 ///
1474 /// \headerfile <x86intrin.h>
1475 ///
1476 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1477 /// instruction.
1478 ///
1479 /// \param __a
1480 /// A 128-bit vector of [2 x double].
1481 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1482 /// converted values. The upper 64 bits are set to zero.
1483 static __inline__ __m128i __DEFAULT_FN_ATTRS
1484 _mm_cvttpd_epi32(__m128d __a)
1485 {
1486  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1487 }
1488 
1489 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1490 /// signed integer value, truncating the result when it is inexact.
1491 ///
1492 /// \headerfile <x86intrin.h>
1493 ///
1494 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1495 /// instruction.
1496 ///
1497 /// \param __a
1498 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1499 /// conversion.
1500 /// \returns A 32-bit signed integer containing the converted value.
1501 static __inline__ int __DEFAULT_FN_ATTRS
1502 _mm_cvttsd_si32(__m128d __a)
1503 {
1504  return __builtin_ia32_cvttsd2si((__v2df)__a);
1505 }
1506 
1507 /// Converts the two double-precision floating-point elements of a
1508 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1509 /// returned in a 64-bit vector of [2 x i32].
1510 ///
1511 /// \headerfile <x86intrin.h>
1512 ///
1513 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1514 ///
1515 /// \param __a
1516 /// A 128-bit vector of [2 x double].
1517 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1518 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1519 _mm_cvtpd_pi32(__m128d __a)
1520 {
1521  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1522 }
1523 
1524 /// Converts the two double-precision floating-point elements of a
1525 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1526 /// returned in a 64-bit vector of [2 x i32].
1527 ///
1528 /// If the result of either conversion is inexact, the result is truncated
1529 /// (rounded towards zero) regardless of the current MXCSR setting.
1530 ///
1531 /// \headerfile <x86intrin.h>
1532 ///
1533 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1534 ///
1535 /// \param __a
1536 /// A 128-bit vector of [2 x double].
1537 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1538 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1539 _mm_cvttpd_pi32(__m128d __a)
1540 {
1541  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1542 }
1543 
1544 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1545 /// [2 x i32] into two double-precision floating-point values, returned in a
1546 /// 128-bit vector of [2 x double].
1547 ///
1548 /// \headerfile <x86intrin.h>
1549 ///
1550 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1551 ///
1552 /// \param __a
1553 /// A 64-bit vector of [2 x i32].
1554 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1555 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
1556 _mm_cvtpi32_pd(__m64 __a)
1557 {
1558  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1559 }
1560 
1561 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1562 /// a double-precision floating-point value.
1563 ///
1564 /// \headerfile <x86intrin.h>
1565 ///
1566 /// This intrinsic has no corresponding instruction.
1567 ///
1568 /// \param __a
1569 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1570 /// \returns A double-precision floating-point value copied from the lower 64
1571 /// bits of \a __a.
1572 static __inline__ double __DEFAULT_FN_ATTRS
1573 _mm_cvtsd_f64(__m128d __a)
1574 {
1575  return __a[0];
1576 }
1577 
1578 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1579 /// memory location.
1580 ///
1581 /// \headerfile <x86intrin.h>
1582 ///
1583 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1584 ///
1585 /// \param __dp
1586 /// A pointer to a 128-bit memory location. The address of the memory
1587 /// location has to be 16-byte aligned.
1588 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1589 static __inline__ __m128d __DEFAULT_FN_ATTRS
1590 _mm_load_pd(double const *__dp)
1591 {
1592  return *(__m128d*)__dp;
1593 }
1594 
1595 /// Loads a double-precision floating-point value from a specified memory
1596 /// location and duplicates it to both vector elements of a 128-bit vector of
1597 /// [2 x double].
1598 ///
1599 /// \headerfile <x86intrin.h>
1600 ///
1601 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1602 ///
1603 /// \param __dp
1604 /// A pointer to a memory location containing a double-precision value.
1605 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1606 /// duplicated values.
1607 static __inline__ __m128d __DEFAULT_FN_ATTRS
1608 _mm_load1_pd(double const *__dp)
1609 {
1610  struct __mm_load1_pd_struct {
1611  double __u;
1612  } __attribute__((__packed__, __may_alias__));
1613  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
1614  return __extension__ (__m128d){ __u, __u };
1615 }
1616 
/* Alternate spelling of _mm_load1_pd; forwards its argument unchanged. */
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1618 
1619 /// Loads two double-precision values, in reverse order, from an aligned
1620 /// memory location into a 128-bit vector of [2 x double].
1621 ///
1622 /// \headerfile <x86intrin.h>
1623 ///
1624 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1625 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1626 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1627 ///
1628 /// \param __dp
1629 /// A 16-byte aligned pointer to an array of double-precision values to be
1630 /// loaded in reverse order.
1631 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1632 /// values.
1633 static __inline__ __m128d __DEFAULT_FN_ATTRS
1634 _mm_loadr_pd(double const *__dp)
1635 {
1636  __m128d __u = *(__m128d*)__dp;
1637  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1638 }
1639 
1640 /// Loads a 128-bit floating-point vector of [2 x double] from an
1641 /// unaligned memory location.
1642 ///
1643 /// \headerfile <x86intrin.h>
1644 ///
1645 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1646 ///
1647 /// \param __dp
1648 /// A pointer to a 128-bit memory location. The address of the memory
1649 /// location does not have to be aligned.
1650 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1651 static __inline__ __m128d __DEFAULT_FN_ATTRS
1652 _mm_loadu_pd(double const *__dp)
1653 {
1654  struct __loadu_pd {
1655  __m128d __v;
1656  } __attribute__((__packed__, __may_alias__));
1657  return ((struct __loadu_pd*)__dp)->__v;
1658 }
1659 
1660 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1661 /// vector and clears the upper element.
1662 ///
1663 /// \headerfile <x86intrin.h>
1664 ///
1665 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1666 ///
1667 /// \param __a
1668 /// A pointer to a 64-bit memory location. The address of the memory
1669 /// location does not have to be aligned.
1670 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1671 static __inline__ __m128i __DEFAULT_FN_ATTRS
1672 _mm_loadu_si64(void const *__a)
1673 {
1674  struct __loadu_si64 {
1675  long long __v;
1676  } __attribute__((__packed__, __may_alias__));
1677  long long __u = ((struct __loadu_si64*)__a)->__v;
1678  return __extension__ (__m128i)(__v2di){__u, 0L};
1679 }
1680 
1681 /// Loads a 64-bit double-precision value to the low element of a
1682 /// 128-bit vector of [2 x double] and clears the upper element.
1683 ///
1684 /// \headerfile <x86intrin.h>
1685 ///
1686 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1687 ///
1688 /// \param __dp
1689 /// A pointer to a memory location containing a double-precision value.
1690 /// The address of the memory location does not have to be aligned.
1691 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1692 static __inline__ __m128d __DEFAULT_FN_ATTRS
1693 _mm_load_sd(double const *__dp)
1694 {
1695  struct __mm_load_sd_struct {
1696  double __u;
1697  } __attribute__((__packed__, __may_alias__));
1698  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
1699  return __extension__ (__m128d){ __u, 0 };
1700 }
1701 
1702 /// Loads a double-precision value into the high-order bits of a 128-bit
1703 /// vector of [2 x double]. The low-order bits are copied from the low-order
1704 /// bits of the first operand.
1705 ///
1706 /// \headerfile <x86intrin.h>
1707 ///
1708 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1709 ///
1710 /// \param __a
1711 /// A 128-bit vector of [2 x double]. \n
1712 /// Bits [63:0] are written to bits [63:0] of the result.
1713 /// \param __dp
1714 /// A pointer to a 64-bit memory location containing a double-precision
1715 /// floating-point value that is loaded. The loaded value is written to bits
1716 /// [127:64] of the result. The address of the memory location does not have
1717 /// to be aligned.
1718 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1719 static __inline__ __m128d __DEFAULT_FN_ATTRS
1720 _mm_loadh_pd(__m128d __a, double const *__dp)
1721 {
1722  struct __mm_loadh_pd_struct {
1723  double __u;
1724  } __attribute__((__packed__, __may_alias__));
1725  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
1726  return __extension__ (__m128d){ __a[0], __u };
1727 }
1728 
1729 /// Loads a double-precision value into the low-order bits of a 128-bit
1730 /// vector of [2 x double]. The high-order bits are copied from the
1731 /// high-order bits of the first operand.
1732 ///
1733 /// \headerfile <x86intrin.h>
1734 ///
1735 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1736 ///
1737 /// \param __a
1738 /// A 128-bit vector of [2 x double]. \n
1739 /// Bits [127:64] are written to bits [127:64] of the result.
1740 /// \param __dp
1741 /// A pointer to a 64-bit memory location containing a double-precision
1742 /// floating-point value that is loaded. The loaded value is written to bits
1743 /// [63:0] of the result. The address of the memory location does not have to
1744 /// be aligned.
1745 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1746 static __inline__ __m128d __DEFAULT_FN_ATTRS
1747 _mm_loadl_pd(__m128d __a, double const *__dp)
1748 {
1749  struct __mm_loadl_pd_struct {
1750  double __u;
1751  } __attribute__((__packed__, __may_alias__));
1752  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
1753  return __extension__ (__m128d){ __u, __a[1] };
1754 }
1755 
1756 /// Constructs a 128-bit floating-point vector of [2 x double] with
1757 /// unspecified content. This could be used as an argument to another
1758 /// intrinsic function where the argument is required but the value is not
1759 /// actually used.
1760 ///
1761 /// \headerfile <x86intrin.h>
1762 ///
1763 /// This intrinsic has no corresponding instruction.
1764 ///
1765 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1766 /// content.
1767 static __inline__ __m128d __DEFAULT_FN_ATTRS
1769 {
1770  return (__m128d)__builtin_ia32_undef128();
1771 }
1772 
1773 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1774 /// 64 bits of the vector are initialized with the specified double-precision
1775 /// floating-point value. The upper 64 bits are set to zero.
1776 ///
1777 /// \headerfile <x86intrin.h>
1778 ///
1779 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1780 ///
1781 /// \param __w
1782 /// A double-precision floating-point value used to initialize the lower 64
1783 /// bits of the result.
1784 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1785 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1786 /// set to zero.
1787 static __inline__ __m128d __DEFAULT_FN_ATTRS
1788 _mm_set_sd(double __w)
1789 {
1790  return __extension__ (__m128d){ __w, 0 };
1791 }
1792 
1793 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1794 /// of the two double-precision floating-point vector elements set to the
1795 /// specified double-precision floating-point value.
1796 ///
1797 /// \headerfile <x86intrin.h>
1798 ///
1799 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1800 ///
1801 /// \param __w
1802 /// A double-precision floating-point value used to initialize each vector
1803 /// element of the result.
1804 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1805 static __inline__ __m128d __DEFAULT_FN_ATTRS
1806 _mm_set1_pd(double __w)
1807 {
1808  return __extension__ (__m128d){ __w, __w };
1809 }
1810 
1811 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1812 /// of the two double-precision floating-point vector elements set to the
1813 /// specified double-precision floating-point value.
1814 ///
1815 /// \headerfile <x86intrin.h>
1816 ///
1817 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1818 ///
1819 /// \param __w
1820 /// A double-precision floating-point value used to initialize each vector
1821 /// element of the result.
1822 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1823 static __inline__ __m128d __DEFAULT_FN_ATTRS
1824 _mm_set_pd1(double __w)
1825 {
1826  return _mm_set1_pd(__w);
1827 }
1828 
1829 /// Constructs a 128-bit floating-point vector of [2 x double]
1830 /// initialized with the specified double-precision floating-point values.
1831 ///
1832 /// \headerfile <x86intrin.h>
1833 ///
1834 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1835 ///
1836 /// \param __w
1837 /// A double-precision floating-point value used to initialize the upper 64
1838 /// bits of the result.
1839 /// \param __x
1840 /// A double-precision floating-point value used to initialize the lower 64
1841 /// bits of the result.
1842 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1843 static __inline__ __m128d __DEFAULT_FN_ATTRS
1844 _mm_set_pd(double __w, double __x)
1845 {
1846  return __extension__ (__m128d){ __x, __w };
1847 }
1848 
1849 /// Constructs a 128-bit floating-point vector of [2 x double],
1850 /// initialized in reverse order with the specified double-precision
1851 /// floating-point values.
1852 ///
1853 /// \headerfile <x86intrin.h>
1854 ///
1855 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1856 ///
1857 /// \param __w
1858 /// A double-precision floating-point value used to initialize the lower 64
1859 /// bits of the result.
1860 /// \param __x
1861 /// A double-precision floating-point value used to initialize the upper 64
1862 /// bits of the result.
1863 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1864 static __inline__ __m128d __DEFAULT_FN_ATTRS
1865 _mm_setr_pd(double __w, double __x)
1866 {
1867  return __extension__ (__m128d){ __w, __x };
1868 }
1869 
1870 /// Constructs a 128-bit floating-point vector of [2 x double]
1871 /// initialized to zero.
1872 ///
1873 /// \headerfile <x86intrin.h>
1874 ///
1875 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1876 ///
1877 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1878 /// all elements set to zero.
1879 static __inline__ __m128d __DEFAULT_FN_ATTRS
1881 {
1882  return __extension__ (__m128d){ 0, 0 };
1883 }
1884 
1885 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1886 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1887 /// 64 bits are set to the upper 64 bits of the first parameter.
1888 ///
1889 /// \headerfile <x86intrin.h>
1890 ///
1891 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1892 ///
1893 /// \param __a
1894 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1895 /// upper 64 bits of the result.
1896 /// \param __b
1897 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1898 /// lower 64 bits of the result.
1899 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1900 static __inline__ __m128d __DEFAULT_FN_ATTRS
1901 _mm_move_sd(__m128d __a, __m128d __b)
1902 {
1903  return __extension__ (__m128d){ __b[0], __a[1] };
1904 }
1905 
1906 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1907 /// memory location.
1908 ///
1909 /// \headerfile <x86intrin.h>
1910 ///
1911 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1912 ///
1913 /// \param __dp
1914 /// A pointer to a 64-bit memory location.
1915 /// \param __a
1916 /// A 128-bit vector of [2 x double] containing the value to be stored.
1917 static __inline__ void __DEFAULT_FN_ATTRS
1918 _mm_store_sd(double *__dp, __m128d __a)
1919 {
1920  struct __mm_store_sd_struct {
1921  double __u;
1922  } __attribute__((__packed__, __may_alias__));
1923  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1924 }
1925 
1926 /// Moves packed double-precision values from a 128-bit vector of
1927 /// [2 x double] to a memory location.
1928 ///
1929 /// \headerfile <x86intrin.h>
1930 ///
1931 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1932 ///
1933 /// \param __dp
1934 /// A pointer to an aligned memory location that can store two
1935 /// double-precision values.
1936 /// \param __a
1937 /// A packed 128-bit vector of [2 x double] containing the values to be
1938 /// moved.
1939 static __inline__ void __DEFAULT_FN_ATTRS
1940 _mm_store_pd(double *__dp, __m128d __a)
1941 {
1942  *(__m128d*)__dp = __a;
1943 }
1944 
1945 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1946 /// the upper and lower 64 bits of a memory location.
1947 ///
1948 /// \headerfile <x86intrin.h>
1949 ///
1950 /// This intrinsic corresponds to the
1951 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1952 ///
1953 /// \param __dp
1954 /// A pointer to a memory location that can store two double-precision
1955 /// values.
1956 /// \param __a
1957 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1958 /// of the values in \a __dp.
1959 static __inline__ void __DEFAULT_FN_ATTRS
1960 _mm_store1_pd(double *__dp, __m128d __a)
1961 {
1962  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1963  _mm_store_pd(__dp, __a);
1964 }
1965 
1966 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1967 /// the upper and lower 64 bits of a memory location.
1968 ///
1969 /// \headerfile <x86intrin.h>
1970 ///
1971 /// This intrinsic corresponds to the
1972 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1973 ///
1974 /// \param __dp
1975 /// A pointer to a memory location that can store two double-precision
1976 /// values.
1977 /// \param __a
1978 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1979 /// of the values in \a __dp.
1980 static __inline__ void __DEFAULT_FN_ATTRS
1981 _mm_store_pd1(double *__dp, __m128d __a)
1982 {
1983  _mm_store1_pd(__dp, __a);
1984 }
1985 
1986 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1987 /// location.
1988 ///
1989 /// \headerfile <x86intrin.h>
1990 ///
1991 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1992 ///
1993 /// \param __dp
1994 /// A pointer to a 128-bit memory location. The address of the memory
1995 /// location does not have to be aligned.
1996 /// \param __a
1997 /// A 128-bit vector of [2 x double] containing the values to be stored.
1998 static __inline__ void __DEFAULT_FN_ATTRS
1999 _mm_storeu_pd(double *__dp, __m128d __a)
2000 {
2001  struct __storeu_pd {
2002  __m128d __v;
2003  } __attribute__((__packed__, __may_alias__));
2004  ((struct __storeu_pd*)__dp)->__v = __a;
2005 }
2006 
2007 /// Stores two double-precision values, in reverse order, from a 128-bit
2008 /// vector of [2 x double] to a 16-byte aligned memory location.
2009 ///
2010 /// \headerfile <x86intrin.h>
2011 ///
2012 /// This intrinsic corresponds to a shuffling instruction followed by a
2013 /// <c> VMOVAPD / MOVAPD </c> instruction.
2014 ///
2015 /// \param __dp
2016 /// A pointer to a 16-byte aligned memory location that can store two
2017 /// double-precision values.
2018 /// \param __a
2019 /// A 128-bit vector of [2 x double] containing the values to be reversed and
2020 /// stored.
2021 static __inline__ void __DEFAULT_FN_ATTRS
2022 _mm_storer_pd(double *__dp, __m128d __a)
2023 {
2024  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2025  *(__m128d *)__dp = __a;
2026 }
2027 
2028 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2029 /// memory location.
2030 ///
2031 /// \headerfile <x86intrin.h>
2032 ///
2033 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2034 ///
2035 /// \param __dp
2036 /// A pointer to a 64-bit memory location.
2037 /// \param __a
2038 /// A 128-bit vector of [2 x double] containing the value to be stored.
2039 static __inline__ void __DEFAULT_FN_ATTRS
2040 _mm_storeh_pd(double *__dp, __m128d __a)
2041 {
2042  struct __mm_storeh_pd_struct {
2043  double __u;
2044  } __attribute__((__packed__, __may_alias__));
2045  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
2046 }
2047 
2048 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2049 /// memory location.
2050 ///
2051 /// \headerfile <x86intrin.h>
2052 ///
2053 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2054 ///
2055 /// \param __dp
2056 /// A pointer to a 64-bit memory location.
2057 /// \param __a
2058 /// A 128-bit vector of [2 x double] containing the value to be stored.
2059 static __inline__ void __DEFAULT_FN_ATTRS
2060 _mm_storel_pd(double *__dp, __m128d __a)
2061 {
2062  struct __mm_storeh_pd_struct {
2063  double __u;
2064  } __attribute__((__packed__, __may_alias__));
2065  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
2066 }
2067 
2068 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2069 /// saving the lower 8 bits of each sum in the corresponding element of a
2070 /// 128-bit result vector of [16 x i8].
2071 ///
2072 /// The integer elements of both parameters can be either signed or unsigned.
2073 ///
2074 /// \headerfile <x86intrin.h>
2075 ///
2076 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2077 ///
2078 /// \param __a
2079 /// A 128-bit vector of [16 x i8].
2080 /// \param __b
2081 /// A 128-bit vector of [16 x i8].
2082 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2083 /// parameters.
2084 static __inline__ __m128i __DEFAULT_FN_ATTRS
2085 _mm_add_epi8(__m128i __a, __m128i __b)
2086 {
2087  return (__m128i)((__v16qu)__a + (__v16qu)__b);
2088 }
2089 
2090 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2091 /// saving the lower 16 bits of each sum in the corresponding element of a
2092 /// 128-bit result vector of [8 x i16].
2093 ///
2094 /// The integer elements of both parameters can be either signed or unsigned.
2095 ///
2096 /// \headerfile <x86intrin.h>
2097 ///
2098 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2099 ///
2100 /// \param __a
2101 /// A 128-bit vector of [8 x i16].
2102 /// \param __b
2103 /// A 128-bit vector of [8 x i16].
2104 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2105 /// parameters.
2106 static __inline__ __m128i __DEFAULT_FN_ATTRS
2107 _mm_add_epi16(__m128i __a, __m128i __b)
2108 {
2109  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2110 }
2111 
2112 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2113 /// saving the lower 32 bits of each sum in the corresponding element of a
2114 /// 128-bit result vector of [4 x i32].
2115 ///
2116 /// The integer elements of both parameters can be either signed or unsigned.
2117 ///
2118 /// \headerfile <x86intrin.h>
2119 ///
2120 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2121 ///
2122 /// \param __a
2123 /// A 128-bit vector of [4 x i32].
2124 /// \param __b
2125 /// A 128-bit vector of [4 x i32].
2126 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2127 /// parameters.
2128 static __inline__ __m128i __DEFAULT_FN_ATTRS
2129 _mm_add_epi32(__m128i __a, __m128i __b)
2130 {
2131  return (__m128i)((__v4su)__a + (__v4su)__b);
2132 }
2133 
2134 /// Adds two signed or unsigned 64-bit integer values, returning the
2135 /// lower 64 bits of the sum.
2136 ///
2137 /// \headerfile <x86intrin.h>
2138 ///
2139 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2140 ///
2141 /// \param __a
2142 /// A 64-bit integer.
2143 /// \param __b
2144 /// A 64-bit integer.
2145 /// \returns A 64-bit integer containing the sum of both parameters.
2146 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2147 _mm_add_si64(__m64 __a, __m64 __b)
2148 {
2149  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2150 }
2151 
2152 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2153 /// saving the lower 64 bits of each sum in the corresponding element of a
2154 /// 128-bit result vector of [2 x i64].
2155 ///
2156 /// The integer elements of both parameters can be either signed or unsigned.
2157 ///
2158 /// \headerfile <x86intrin.h>
2159 ///
2160 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2161 ///
2162 /// \param __a
2163 /// A 128-bit vector of [2 x i64].
2164 /// \param __b
2165 /// A 128-bit vector of [2 x i64].
2166 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2167 /// parameters.
2168 static __inline__ __m128i __DEFAULT_FN_ATTRS
2169 _mm_add_epi64(__m128i __a, __m128i __b)
2170 {
2171  return (__m128i)((__v2du)__a + (__v2du)__b);
2172 }
2173 
2174 /// Adds, with saturation, the corresponding elements of two 128-bit
2175 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2176 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2177 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2178 ///
2179 /// \headerfile <x86intrin.h>
2180 ///
2181 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2182 ///
2183 /// \param __a
2184 /// A 128-bit signed [16 x i8] vector.
2185 /// \param __b
2186 /// A 128-bit signed [16 x i8] vector.
2187 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2188 /// both parameters.
2189 static __inline__ __m128i __DEFAULT_FN_ATTRS
2190 _mm_adds_epi8(__m128i __a, __m128i __b)
2191 {
2192  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2193 }
2194 
2195 /// Adds, with saturation, the corresponding elements of two 128-bit
2196 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2197 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2198 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2199 /// 0x8000.
2200 ///
2201 /// \headerfile <x86intrin.h>
2202 ///
2203 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2204 ///
2205 /// \param __a
2206 /// A 128-bit signed [8 x i16] vector.
2207 /// \param __b
2208 /// A 128-bit signed [8 x i16] vector.
2209 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2210 /// both parameters.
2211 static __inline__ __m128i __DEFAULT_FN_ATTRS
2212 _mm_adds_epi16(__m128i __a, __m128i __b)
2213 {
2214  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2215 }
2216 
2217 /// Adds, with saturation, the corresponding elements of two 128-bit
2218 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2219 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2220 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2221 ///
2222 /// \headerfile <x86intrin.h>
2223 ///
2224 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2225 ///
2226 /// \param __a
2227 /// A 128-bit unsigned [16 x i8] vector.
2228 /// \param __b
2229 /// A 128-bit unsigned [16 x i8] vector.
2230 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2231 /// of both parameters.
2232 static __inline__ __m128i __DEFAULT_FN_ATTRS
2233 _mm_adds_epu8(__m128i __a, __m128i __b)
2234 {
2235  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2236 }
2237 
2238 /// Adds, with saturation, the corresponding elements of two 128-bit
2239 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2240 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2241 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2242 ///
2243 /// \headerfile <x86intrin.h>
2244 ///
2245 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2246 ///
2247 /// \param __a
2248 /// A 128-bit unsigned [8 x i16] vector.
2249 /// \param __b
2250 /// A 128-bit unsigned [8 x i16] vector.
2251 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2252 /// of both parameters.
2253 static __inline__ __m128i __DEFAULT_FN_ATTRS
2254 _mm_adds_epu16(__m128i __a, __m128i __b)
2255 {
2256  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2257 }
2258 
2259 /// Computes the rounded averages of corresponding elements of two
2260 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2261 /// corresponding element of a 128-bit result vector of [16 x i8].
2262 ///
2263 /// \headerfile <x86intrin.h>
2264 ///
2265 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2266 ///
2267 /// \param __a
2268 /// A 128-bit unsigned [16 x i8] vector.
2269 /// \param __b
2270 /// A 128-bit unsigned [16 x i8] vector.
2271 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2272 /// averages of both parameters.
2273 static __inline__ __m128i __DEFAULT_FN_ATTRS
2274 _mm_avg_epu8(__m128i __a, __m128i __b)
2275 {
2276  typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
2277  return (__m128i)__builtin_convertvector(
2278  ((__builtin_convertvector((__v16qu)__a, __v16hu) +
2279  __builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
2280  >> 1, __v16qu);
2281 }
2282 
2283 /// Computes the rounded averages of corresponding elements of two
2284 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2285 /// corresponding element of a 128-bit result vector of [8 x i16].
2286 ///
2287 /// \headerfile <x86intrin.h>
2288 ///
2289 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2290 ///
2291 /// \param __a
2292 /// A 128-bit unsigned [8 x i16] vector.
2293 /// \param __b
2294 /// A 128-bit unsigned [8 x i16] vector.
2295 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2296 /// averages of both parameters.
2297 static __inline__ __m128i __DEFAULT_FN_ATTRS
2298 _mm_avg_epu16(__m128i __a, __m128i __b)
2299 {
2300  typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
2301  return (__m128i)__builtin_convertvector(
2302  ((__builtin_convertvector((__v8hu)__a, __v8su) +
2303  __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
2304  >> 1, __v8hu);
2305 }
2306 
2307 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2308 /// vectors, producing eight intermediate 32-bit signed integer products, and
2309 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2310 /// [4 x i32] vector.
2311 ///
2312 /// For example, bits [15:0] of both parameters are multiplied producing a
2313 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2314 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2315 /// of the result.
2316 ///
2317 /// \headerfile <x86intrin.h>
2318 ///
2319 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2320 ///
2321 /// \param __a
2322 /// A 128-bit signed [8 x i16] vector.
2323 /// \param __b
2324 /// A 128-bit signed [8 x i16] vector.
2325 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2326 /// of both parameters.
2327 static __inline__ __m128i __DEFAULT_FN_ATTRS
2328 _mm_madd_epi16(__m128i __a, __m128i __b)
2329 {
2330  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2331 }
2332 
2333 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2334 /// vectors, saving the greater value from each comparison in the
2335 /// corresponding element of a 128-bit result vector of [8 x i16].
2336 ///
2337 /// \headerfile <x86intrin.h>
2338 ///
2339 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2340 ///
2341 /// \param __a
2342 /// A 128-bit signed [8 x i16] vector.
2343 /// \param __b
2344 /// A 128-bit signed [8 x i16] vector.
2345 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2346 /// each comparison.
2347 static __inline__ __m128i __DEFAULT_FN_ATTRS
2348 _mm_max_epi16(__m128i __a, __m128i __b)
2349 {
2350  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2351 }
2352 
2353 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2354 /// vectors, saving the greater value from each comparison in the
2355 /// corresponding element of a 128-bit result vector of [16 x i8].
2356 ///
2357 /// \headerfile <x86intrin.h>
2358 ///
2359 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2360 ///
2361 /// \param __a
2362 /// A 128-bit unsigned [16 x i8] vector.
2363 /// \param __b
2364 /// A 128-bit unsigned [16 x i8] vector.
2365 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2366 /// each comparison.
2367 static __inline__ __m128i __DEFAULT_FN_ATTRS
2368 _mm_max_epu8(__m128i __a, __m128i __b)
2369 {
2370  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2371 }
2372 
2373 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2374 /// vectors, saving the smaller value from each comparison in the
2375 /// corresponding element of a 128-bit result vector of [8 x i16].
2376 ///
2377 /// \headerfile <x86intrin.h>
2378 ///
2379 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2380 ///
2381 /// \param __a
2382 /// A 128-bit signed [8 x i16] vector.
2383 /// \param __b
2384 /// A 128-bit signed [8 x i16] vector.
2385 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2386 /// each comparison.
2387 static __inline__ __m128i __DEFAULT_FN_ATTRS
2388 _mm_min_epi16(__m128i __a, __m128i __b)
2389 {
2390  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2391 }
2392 
2393 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2394 /// vectors, saving the smaller value from each comparison in the
2395 /// corresponding element of a 128-bit result vector of [16 x i8].
2396 ///
2397 /// \headerfile <x86intrin.h>
2398 ///
2399 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2400 ///
2401 /// \param __a
2402 /// A 128-bit unsigned [16 x i8] vector.
2403 /// \param __b
2404 /// A 128-bit unsigned [16 x i8] vector.
2405 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2406 /// each comparison.
2407 static __inline__ __m128i __DEFAULT_FN_ATTRS
2408 _mm_min_epu8(__m128i __a, __m128i __b)
2409 {
2410  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2411 }
2412 
2413 /// Multiplies the corresponding elements of two signed [8 x i16]
2414 /// vectors, saving the upper 16 bits of each 32-bit product in the
2415 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2416 ///
2417 /// \headerfile <x86intrin.h>
2418 ///
2419 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2420 ///
2421 /// \param __a
2422 /// A 128-bit signed [8 x i16] vector.
2423 /// \param __b
2424 /// A 128-bit signed [8 x i16] vector.
2425 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2426 /// each of the eight 32-bit products.
2427 static __inline__ __m128i __DEFAULT_FN_ATTRS
2428 _mm_mulhi_epi16(__m128i __a, __m128i __b)
2429 {
2430  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2431 }
2432 
2433 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2434 /// vectors, saving the upper 16 bits of each 32-bit product in the
2435 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2436 ///
2437 /// \headerfile <x86intrin.h>
2438 ///
2439 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2440 ///
2441 /// \param __a
2442 /// A 128-bit unsigned [8 x i16] vector.
2443 /// \param __b
2444 /// A 128-bit unsigned [8 x i16] vector.
2445 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2446 /// of each of the eight 32-bit products.
2447 static __inline__ __m128i __DEFAULT_FN_ATTRS
2448 _mm_mulhi_epu16(__m128i __a, __m128i __b)
2449 {
2450  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2451 }
2452 
2453 /// Multiplies the corresponding elements of two signed [8 x i16]
2454 /// vectors, saving the lower 16 bits of each 32-bit product in the
2455 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2456 ///
2457 /// \headerfile <x86intrin.h>
2458 ///
2459 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2460 ///
2461 /// \param __a
2462 /// A 128-bit signed [8 x i16] vector.
2463 /// \param __b
2464 /// A 128-bit signed [8 x i16] vector.
2465 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2466 /// each of the eight 32-bit products.
2467 static __inline__ __m128i __DEFAULT_FN_ATTRS
2468 _mm_mullo_epi16(__m128i __a, __m128i __b)
2469 {
2470  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2471 }
2472 
2473 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2474 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2475 /// product.
2476 ///
2477 /// \headerfile <x86intrin.h>
2478 ///
2479 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2480 ///
2481 /// \param __a
2482 /// A 64-bit integer containing one of the source operands.
2483 /// \param __b
2484 /// A 64-bit integer containing one of the source operands.
2485 /// \returns A 64-bit integer vector containing the product of both operands.
2486 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2487 _mm_mul_su32(__m64 __a, __m64 __b)
2488 {
2489  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2490 }
2491 
2492 /// Multiplies 32-bit unsigned integer values contained in the lower
2493 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2494 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2495 ///
2496 /// \headerfile <x86intrin.h>
2497 ///
2498 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2499 ///
2500 /// \param __a
2501 /// A [2 x i64] vector containing one of the source operands.
2502 /// \param __b
2503 /// A [2 x i64] vector containing one of the source operands.
2504 /// \returns A [2 x i64] vector containing the product of both operands.
2505 static __inline__ __m128i __DEFAULT_FN_ATTRS
2506 _mm_mul_epu32(__m128i __a, __m128i __b)
2507 {
2508  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2509 }
2510 
2511 /// Computes the absolute differences of corresponding 8-bit integer
2512 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2513 /// separately sums the second 8 absolute differences. Packs these two
2514 /// unsigned 16-bit integer sums into the upper and lower elements of a
2515 /// [2 x i64] vector.
2516 ///
2517 /// \headerfile <x86intrin.h>
2518 ///
2519 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2520 ///
2521 /// \param __a
2522 /// A 128-bit integer vector containing one of the source operands.
2523 /// \param __b
2524 /// A 128-bit integer vector containing one of the source operands.
2525 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2526 /// differences between both operands.
2527 static __inline__ __m128i __DEFAULT_FN_ATTRS
2528 _mm_sad_epu8(__m128i __a, __m128i __b)
2529 {
2530  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2531 }
2532 
2533 /// Subtracts the corresponding 8-bit integer values in the operands.
2534 ///
2535 /// \headerfile <x86intrin.h>
2536 ///
2537 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2538 ///
2539 /// \param __a
2540 /// A 128-bit integer vector containing the minuends.
2541 /// \param __b
2542 /// A 128-bit integer vector containing the subtrahends.
2543 /// \returns A 128-bit integer vector containing the differences of the values
2544 /// in the operands.
2545 static __inline__ __m128i __DEFAULT_FN_ATTRS
2546 _mm_sub_epi8(__m128i __a, __m128i __b)
2547 {
2548  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2549 }
2550 
2551 /// Subtracts the corresponding 16-bit integer values in the operands.
2552 ///
2553 /// \headerfile <x86intrin.h>
2554 ///
2555 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2556 ///
2557 /// \param __a
2558 /// A 128-bit integer vector containing the minuends.
2559 /// \param __b
2560 /// A 128-bit integer vector containing the subtrahends.
2561 /// \returns A 128-bit integer vector containing the differences of the values
2562 /// in the operands.
2563 static __inline__ __m128i __DEFAULT_FN_ATTRS
2564 _mm_sub_epi16(__m128i __a, __m128i __b)
2565 {
2566  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2567 }
2568 
2569 /// Subtracts the corresponding 32-bit integer values in the operands.
2570 ///
2571 /// \headerfile <x86intrin.h>
2572 ///
2573 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2574 ///
2575 /// \param __a
2576 /// A 128-bit integer vector containing the minuends.
2577 /// \param __b
2578 /// A 128-bit integer vector containing the subtrahends.
2579 /// \returns A 128-bit integer vector containing the differences of the values
2580 /// in the operands.
2581 static __inline__ __m128i __DEFAULT_FN_ATTRS
2582 _mm_sub_epi32(__m128i __a, __m128i __b)
2583 {
2584  return (__m128i)((__v4su)__a - (__v4su)__b);
2585 }
2586 
2587 /// Subtracts signed or unsigned 64-bit integer values and writes the
2588 /// difference to the corresponding bits in the destination.
2589 ///
2590 /// \headerfile <x86intrin.h>
2591 ///
2592 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2593 ///
2594 /// \param __a
2595 /// A 64-bit integer vector containing the minuend.
2596 /// \param __b
2597 /// A 64-bit integer vector containing the subtrahend.
2598 /// \returns A 64-bit integer vector containing the difference of the values in
2599 /// the operands.
2600 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2601 _mm_sub_si64(__m64 __a, __m64 __b)
2602 {
2603  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2604 }
2605 
2606 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2607 ///
2608 /// \headerfile <x86intrin.h>
2609 ///
2610 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2611 ///
2612 /// \param __a
2613 /// A 128-bit integer vector containing the minuends.
2614 /// \param __b
2615 /// A 128-bit integer vector containing the subtrahends.
2616 /// \returns A 128-bit integer vector containing the differences of the values
2617 /// in the operands.
2618 static __inline__ __m128i __DEFAULT_FN_ATTRS
2619 _mm_sub_epi64(__m128i __a, __m128i __b)
2620 {
2621  return (__m128i)((__v2du)__a - (__v2du)__b);
2622 }
2623 
2624 /// Subtracts corresponding 8-bit signed integer values in the input and
2625 /// returns the differences in the corresponding bytes in the destination.
2626 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2627 /// than 0x80 are saturated to 0x80.
2628 ///
2629 /// \headerfile <x86intrin.h>
2630 ///
2631 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2632 ///
2633 /// \param __a
2634 /// A 128-bit integer vector containing the minuends.
2635 /// \param __b
2636 /// A 128-bit integer vector containing the subtrahends.
2637 /// \returns A 128-bit integer vector containing the differences of the values
2638 /// in the operands.
2639 static __inline__ __m128i __DEFAULT_FN_ATTRS
2640 _mm_subs_epi8(__m128i __a, __m128i __b)
2641 {
2642  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2643 }
2644 
2645 /// Subtracts corresponding 16-bit signed integer values in the input and
2646 /// returns the differences in the corresponding bytes in the destination.
2647 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2648 /// than 0x8000 are saturated to 0x8000.
2649 ///
2650 /// \headerfile <x86intrin.h>
2651 ///
2652 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2653 ///
2654 /// \param __a
2655 /// A 128-bit integer vector containing the minuends.
2656 /// \param __b
2657 /// A 128-bit integer vector containing the subtrahends.
2658 /// \returns A 128-bit integer vector containing the differences of the values
2659 /// in the operands.
2660 static __inline__ __m128i __DEFAULT_FN_ATTRS
2661 _mm_subs_epi16(__m128i __a, __m128i __b)
2662 {
2663  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2664 }
2665 
2666 /// Subtracts corresponding 8-bit unsigned integer values in the input
2667 /// and returns the differences in the corresponding bytes in the
2668 /// destination. Differences less than 0x00 are saturated to 0x00.
2669 ///
2670 /// \headerfile <x86intrin.h>
2671 ///
2672 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2673 ///
2674 /// \param __a
2675 /// A 128-bit integer vector containing the minuends.
2676 /// \param __b
2677 /// A 128-bit integer vector containing the subtrahends.
2678 /// \returns A 128-bit integer vector containing the unsigned integer
2679 /// differences of the values in the operands.
2680 static __inline__ __m128i __DEFAULT_FN_ATTRS
2681 _mm_subs_epu8(__m128i __a, __m128i __b)
2682 {
2683  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2684 }
2685 
2686 /// Subtracts corresponding 16-bit unsigned integer values in the input
2687 /// and returns the differences in the corresponding bytes in the
2688 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2689 ///
2690 /// \headerfile <x86intrin.h>
2691 ///
2692 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2693 ///
2694 /// \param __a
2695 /// A 128-bit integer vector containing the minuends.
2696 /// \param __b
2697 /// A 128-bit integer vector containing the subtrahends.
2698 /// \returns A 128-bit integer vector containing the unsigned integer
2699 /// differences of the values in the operands.
2700 static __inline__ __m128i __DEFAULT_FN_ATTRS
2701 _mm_subs_epu16(__m128i __a, __m128i __b)
2702 {
2703  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2704 }
2705 
2706 /// Performs a bitwise AND of two 128-bit integer vectors.
2707 ///
2708 /// \headerfile <x86intrin.h>
2709 ///
2710 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2711 ///
2712 /// \param __a
2713 /// A 128-bit integer vector containing one of the source operands.
2714 /// \param __b
2715 /// A 128-bit integer vector containing one of the source operands.
2716 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2717 /// in both operands.
2718 static __inline__ __m128i __DEFAULT_FN_ATTRS
2719 _mm_and_si128(__m128i __a, __m128i __b)
2720 {
2721  return (__m128i)((__v2du)__a & (__v2du)__b);
2722 }
2723 
2724 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2725 /// one's complement of the values contained in the first source operand.
2726 ///
2727 /// \headerfile <x86intrin.h>
2728 ///
2729 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2730 ///
2731 /// \param __a
2732 /// A 128-bit vector containing the left source operand. The one's complement
2733 /// of this value is used in the bitwise AND.
2734 /// \param __b
2735 /// A 128-bit vector containing the right source operand.
2736 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2737 /// complement of the first operand and the values in the second operand.
2738 static __inline__ __m128i __DEFAULT_FN_ATTRS
2739 _mm_andnot_si128(__m128i __a, __m128i __b)
2740 {
2741  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2742 }
2743 /// Performs a bitwise OR of two 128-bit integer vectors.
2744 ///
2745 /// \headerfile <x86intrin.h>
2746 ///
2747 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2748 ///
2749 /// \param __a
2750 /// A 128-bit integer vector containing one of the source operands.
2751 /// \param __b
2752 /// A 128-bit integer vector containing one of the source operands.
2753 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2754 /// in both operands.
2755 static __inline__ __m128i __DEFAULT_FN_ATTRS
2756 _mm_or_si128(__m128i __a, __m128i __b)
2757 {
2758  return (__m128i)((__v2du)__a | (__v2du)__b);
2759 }
2760 
2761 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2762 ///
2763 /// \headerfile <x86intrin.h>
2764 ///
2765 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2766 ///
2767 /// \param __a
2768 /// A 128-bit integer vector containing one of the source operands.
2769 /// \param __b
2770 /// A 128-bit integer vector containing one of the source operands.
2771 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2772 /// values in both operands.
2773 static __inline__ __m128i __DEFAULT_FN_ATTRS
2774 _mm_xor_si128(__m128i __a, __m128i __b)
2775 {
2776  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2777 }
2778 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
/* The whole expansion is parenthesized so that an operator written after the
   macro invocation applies to the cast result, not to the builtin call. */
#define _mm_slli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
2798 
/* Byte-wise left shift of a 128-bit integer vector; expands to the same
   builtin call as _mm_slli_si128. The whole expansion is parenthesized so
   that an operator written after the macro invocation applies to the cast
   result, not to the builtin call. */
#define _mm_bslli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
2801 
2802 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2803 /// by the specified number of bits. Low-order bits are cleared.
2804 ///
2805 /// \headerfile <x86intrin.h>
2806 ///
2807 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2808 ///
2809 /// \param __a
2810 /// A 128-bit integer vector containing the source operand.
2811 /// \param __count
2812 /// An integer value specifying the number of bits to left-shift each value
2813 /// in operand \a __a.
2814 /// \returns A 128-bit integer vector containing the left-shifted values.
2815 static __inline__ __m128i __DEFAULT_FN_ATTRS
2816 _mm_slli_epi16(__m128i __a, int __count)
2817 {
2818  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2819 }
2820 
2821 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2822 /// by the specified number of bits. Low-order bits are cleared.
2823 ///
2824 /// \headerfile <x86intrin.h>
2825 ///
2826 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2827 ///
2828 /// \param __a
2829 /// A 128-bit integer vector containing the source operand.
2830 /// \param __count
2831 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2832 /// to left-shift each value in operand \a __a.
2833 /// \returns A 128-bit integer vector containing the left-shifted values.
2834 static __inline__ __m128i __DEFAULT_FN_ATTRS
2835 _mm_sll_epi16(__m128i __a, __m128i __count)
2836 {
2837  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2838 }
2839 
2840 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2841 /// by the specified number of bits. Low-order bits are cleared.
2842 ///
2843 /// \headerfile <x86intrin.h>
2844 ///
2845 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2846 ///
2847 /// \param __a
2848 /// A 128-bit integer vector containing the source operand.
2849 /// \param __count
2850 /// An integer value specifying the number of bits to left-shift each value
2851 /// in operand \a __a.
2852 /// \returns A 128-bit integer vector containing the left-shifted values.
2853 static __inline__ __m128i __DEFAULT_FN_ATTRS
2854 _mm_slli_epi32(__m128i __a, int __count)
2855 {
2856  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2857 }
2858 
2859 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2860 /// by the specified number of bits. Low-order bits are cleared.
2861 ///
2862 /// \headerfile <x86intrin.h>
2863 ///
2864 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2865 ///
2866 /// \param __a
2867 /// A 128-bit integer vector containing the source operand.
2868 /// \param __count
2869 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2870 /// to left-shift each value in operand \a __a.
2871 /// \returns A 128-bit integer vector containing the left-shifted values.
2872 static __inline__ __m128i __DEFAULT_FN_ATTRS
2873 _mm_sll_epi32(__m128i __a, __m128i __count)
2874 {
2875  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2876 }
2877 
2878 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2879 /// by the specified number of bits. Low-order bits are cleared.
2880 ///
2881 /// \headerfile <x86intrin.h>
2882 ///
2883 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2884 ///
2885 /// \param __a
2886 /// A 128-bit integer vector containing the source operand.
2887 /// \param __count
2888 /// An integer value specifying the number of bits to left-shift each value
2889 /// in operand \a __a.
2890 /// \returns A 128-bit integer vector containing the left-shifted values.
2891 static __inline__ __m128i __DEFAULT_FN_ATTRS
2892 _mm_slli_epi64(__m128i __a, int __count)
2893 {
2894  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2895 }
2896 
2897 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2898 /// by the specified number of bits. Low-order bits are cleared.
2899 ///
2900 /// \headerfile <x86intrin.h>
2901 ///
2902 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2903 ///
2904 /// \param __a
2905 /// A 128-bit integer vector containing the source operand.
2906 /// \param __count
2907 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2908 /// to left-shift each value in operand \a __a.
2909 /// \returns A 128-bit integer vector containing the left-shifted values.
2910 static __inline__ __m128i __DEFAULT_FN_ATTRS
2911 _mm_sll_epi64(__m128i __a, __m128i __count)
2912 {
2913  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2914 }
2915 
2916 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2917 /// by the specified number of bits. High-order bits are filled with the sign
2918 /// bit of the initial value.
2919 ///
2920 /// \headerfile <x86intrin.h>
2921 ///
2922 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2923 ///
2924 /// \param __a
2925 /// A 128-bit integer vector containing the source operand.
2926 /// \param __count
2927 /// An integer value specifying the number of bits to right-shift each value
2928 /// in operand \a __a.
2929 /// \returns A 128-bit integer vector containing the right-shifted values.
2930 static __inline__ __m128i __DEFAULT_FN_ATTRS
2931 _mm_srai_epi16(__m128i __a, int __count)
2932 {
2933  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2934 }
2935 
2936 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2937 /// by the specified number of bits. High-order bits are filled with the sign
2938 /// bit of the initial value.
2939 ///
2940 /// \headerfile <x86intrin.h>
2941 ///
2942 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2943 ///
2944 /// \param __a
2945 /// A 128-bit integer vector containing the source operand.
2946 /// \param __count
2947 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2948 /// to right-shift each value in operand \a __a.
2949 /// \returns A 128-bit integer vector containing the right-shifted values.
2950 static __inline__ __m128i __DEFAULT_FN_ATTRS
2951 _mm_sra_epi16(__m128i __a, __m128i __count)
2952 {
2953  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2954 }
2955 
2956 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2957 /// by the specified number of bits. High-order bits are filled with the sign
2958 /// bit of the initial value.
2959 ///
2960 /// \headerfile <x86intrin.h>
2961 ///
2962 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2963 ///
2964 /// \param __a
2965 /// A 128-bit integer vector containing the source operand.
2966 /// \param __count
2967 /// An integer value specifying the number of bits to right-shift each value
2968 /// in operand \a __a.
2969 /// \returns A 128-bit integer vector containing the right-shifted values.
2970 static __inline__ __m128i __DEFAULT_FN_ATTRS
2971 _mm_srai_epi32(__m128i __a, int __count)
2972 {
2973  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2974 }
2975 
2976 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2977 /// by the specified number of bits. High-order bits are filled with the sign
2978 /// bit of the initial value.
2979 ///
2980 /// \headerfile <x86intrin.h>
2981 ///
2982 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2983 ///
2984 /// \param __a
2985 /// A 128-bit integer vector containing the source operand.
2986 /// \param __count
2987 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2988 /// to right-shift each value in operand \a __a.
2989 /// \returns A 128-bit integer vector containing the right-shifted values.
2990 static __inline__ __m128i __DEFAULT_FN_ATTRS
2991 _mm_sra_epi32(__m128i __a, __m128i __count)
2992 {
2993  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2994 }
2995 
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
/* The whole expansion is parenthesized so that an operator written after the
   macro invocation applies to the cast result, not to the builtin call. */
#define _mm_srli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
3015 
/* Byte-wise right shift of a 128-bit integer vector; expands to the same
   builtin call as _mm_srli_si128. The whole expansion is parenthesized so
   that an operator written after the macro invocation applies to the cast
   result, not to the builtin call. */
#define _mm_bsrli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
3018 
3019 /// Right-shifts each of 16-bit values in the 128-bit integer vector
3020 /// operand by the specified number of bits. High-order bits are cleared.
3021 ///
3022 /// \headerfile <x86intrin.h>
3023 ///
3024 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3025 ///
3026 /// \param __a
3027 /// A 128-bit integer vector containing the source operand.
3028 /// \param __count
3029 /// An integer value specifying the number of bits to right-shift each value
3030 /// in operand \a __a.
3031 /// \returns A 128-bit integer vector containing the right-shifted values.
3032 static __inline__ __m128i __DEFAULT_FN_ATTRS
3033 _mm_srli_epi16(__m128i __a, int __count)
3034 {
3035  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
3036 }
3037 
3038 /// Right-shifts each of 16-bit values in the 128-bit integer vector
3039 /// operand by the specified number of bits. High-order bits are cleared.
3040 ///
3041 /// \headerfile <x86intrin.h>
3042 ///
3043 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3044 ///
3045 /// \param __a
3046 /// A 128-bit integer vector containing the source operand.
3047 /// \param __count
3048 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3049 /// to right-shift each value in operand \a __a.
3050 /// \returns A 128-bit integer vector containing the right-shifted values.
3051 static __inline__ __m128i __DEFAULT_FN_ATTRS
3052 _mm_srl_epi16(__m128i __a, __m128i __count)
3053 {
3054  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3055 }
3056 
3057 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3058 /// operand by the specified number of bits. High-order bits are cleared.
3059 ///
3060 /// \headerfile <x86intrin.h>
3061 ///
3062 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3063 ///
3064 /// \param __a
3065 /// A 128-bit integer vector containing the source operand.
3066 /// \param __count
3067 /// An integer value specifying the number of bits to right-shift each value
3068 /// in operand \a __a.
3069 /// \returns A 128-bit integer vector containing the right-shifted values.
3070 static __inline__ __m128i __DEFAULT_FN_ATTRS
3071 _mm_srli_epi32(__m128i __a, int __count)
3072 {
3073  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3074 }
3075 
3076 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3077 /// operand by the specified number of bits. High-order bits are cleared.
3078 ///
3079 /// \headerfile <x86intrin.h>
3080 ///
3081 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3082 ///
3083 /// \param __a
3084 /// A 128-bit integer vector containing the source operand.
3085 /// \param __count
3086 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3087 /// to right-shift each value in operand \a __a.
3088 /// \returns A 128-bit integer vector containing the right-shifted values.
3089 static __inline__ __m128i __DEFAULT_FN_ATTRS
3090 _mm_srl_epi32(__m128i __a, __m128i __count)
3091 {
3092  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3093 }
3094 
3095 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3096 /// operand by the specified number of bits. High-order bits are cleared.
3097 ///
3098 /// \headerfile <x86intrin.h>
3099 ///
3100 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3101 ///
3102 /// \param __a
3103 /// A 128-bit integer vector containing the source operand.
3104 /// \param __count
3105 /// An integer value specifying the number of bits to right-shift each value
3106 /// in operand \a __a.
3107 /// \returns A 128-bit integer vector containing the right-shifted values.
3108 static __inline__ __m128i __DEFAULT_FN_ATTRS
3109 _mm_srli_epi64(__m128i __a, int __count)
3110 {
3111  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3112 }
3113 
3114 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3115 /// operand by the specified number of bits. High-order bits are cleared.
3116 ///
3117 /// \headerfile <x86intrin.h>
3118 ///
3119 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3120 ///
3121 /// \param __a
3122 /// A 128-bit integer vector containing the source operand.
3123 /// \param __count
3124 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3125 /// to right-shift each value in operand \a __a.
3126 /// \returns A 128-bit integer vector containing the right-shifted values.
3127 static __inline__ __m128i __DEFAULT_FN_ATTRS
3128 _mm_srl_epi64(__m128i __a, __m128i __count)
3129 {
3130  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3131 }
3132 
3133 /// Compares each of the corresponding 8-bit values of the 128-bit
3134 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3135 /// for true.
3136 ///
3137 /// \headerfile <x86intrin.h>
3138 ///
3139 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3140 ///
3141 /// \param __a
3142 /// A 128-bit integer vector.
3143 /// \param __b
3144 /// A 128-bit integer vector.
3145 /// \returns A 128-bit integer vector containing the comparison results.
3146 static __inline__ __m128i __DEFAULT_FN_ATTRS
3147 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
3148 {
3149  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3150 }
3151 
3152 /// Compares each of the corresponding 16-bit values of the 128-bit
3153 /// integer vectors for equality. Each comparison yields 0x0 for false,
3154 /// 0xFFFF for true.
3155 ///
3156 /// \headerfile <x86intrin.h>
3157 ///
3158 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3159 ///
3160 /// \param __a
3161 /// A 128-bit integer vector.
3162 /// \param __b
3163 /// A 128-bit integer vector.
3164 /// \returns A 128-bit integer vector containing the comparison results.
3165 static __inline__ __m128i __DEFAULT_FN_ATTRS
3166 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
3167 {
3168  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3169 }
3170 
3171 /// Compares each of the corresponding 32-bit values of the 128-bit
3172 /// integer vectors for equality. Each comparison yields 0x0 for false,
3173 /// 0xFFFFFFFF for true.
3174 ///
3175 /// \headerfile <x86intrin.h>
3176 ///
3177 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3178 ///
3179 /// \param __a
3180 /// A 128-bit integer vector.
3181 /// \param __b
3182 /// A 128-bit integer vector.
3183 /// \returns A 128-bit integer vector containing the comparison results.
3184 static __inline__ __m128i __DEFAULT_FN_ATTRS
3185 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
3186 {
3187  return (__m128i)((__v4si)__a == (__v4si)__b);
3188 }
3189 
3190 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3191 /// integer vectors to determine if the values in the first operand are
3192 /// greater than those in the second operand. Each comparison yields 0x0 for
3193 /// false, 0xFF for true.
3194 ///
3195 /// \headerfile <x86intrin.h>
3196 ///
3197 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3198 ///
3199 /// \param __a
3200 /// A 128-bit integer vector.
3201 /// \param __b
3202 /// A 128-bit integer vector.
3203 /// \returns A 128-bit integer vector containing the comparison results.
3204 static __inline__ __m128i __DEFAULT_FN_ATTRS
3205 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
3206 {
3207  /* This function always performs a signed comparison, but __v16qi is a char
3208  which may be signed or unsigned, so use __v16qs. */
3209  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3210 }
3211 
3212 /// Compares each of the corresponding signed 16-bit values of the
3213 /// 128-bit integer vectors to determine if the values in the first operand
3214 /// are greater than those in the second operand.
3215 ///
3216 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3217 ///
3218 /// \headerfile <x86intrin.h>
3219 ///
3220 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3221 ///
3222 /// \param __a
3223 /// A 128-bit integer vector.
3224 /// \param __b
3225 /// A 128-bit integer vector.
3226 /// \returns A 128-bit integer vector containing the comparison results.
3227 static __inline__ __m128i __DEFAULT_FN_ATTRS
3228 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
3229 {
3230  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3231 }
3232 
3233 /// Compares each of the corresponding signed 32-bit values of the
3234 /// 128-bit integer vectors to determine if the values in the first operand
3235 /// are greater than those in the second operand.
3236 ///
3237 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3238 ///
3239 /// \headerfile <x86intrin.h>
3240 ///
3241 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3242 ///
3243 /// \param __a
3244 /// A 128-bit integer vector.
3245 /// \param __b
3246 /// A 128-bit integer vector.
3247 /// \returns A 128-bit integer vector containing the comparison results.
3248 static __inline__ __m128i __DEFAULT_FN_ATTRS
3249 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
3250 {
3251  return (__m128i)((__v4si)__a > (__v4si)__b);
3252 }
3253 
3254 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3255 /// integer vectors to determine if the values in the first operand are less
3256 /// than those in the second operand.
3257 ///
3258 /// Each comparison yields 0x0 for false, 0xFF for true.
3259 ///
3260 /// \headerfile <x86intrin.h>
3261 ///
3262 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3263 ///
3264 /// \param __a
3265 /// A 128-bit integer vector.
3266 /// \param __b
3267 /// A 128-bit integer vector.
3268 /// \returns A 128-bit integer vector containing the comparison results.
3269 static __inline__ __m128i __DEFAULT_FN_ATTRS
3270 _mm_cmplt_epi8(__m128i __a, __m128i __b)
3271 {
3272  return _mm_cmpgt_epi8(__b, __a);
3273 }
3274 
3275 /// Compares each of the corresponding signed 16-bit values of the
3276 /// 128-bit integer vectors to determine if the values in the first operand
3277 /// are less than those in the second operand.
3278 ///
3279 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3280 ///
3281 /// \headerfile <x86intrin.h>
3282 ///
3283 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3284 ///
3285 /// \param __a
3286 /// A 128-bit integer vector.
3287 /// \param __b
3288 /// A 128-bit integer vector.
3289 /// \returns A 128-bit integer vector containing the comparison results.
3290 static __inline__ __m128i __DEFAULT_FN_ATTRS
3291 _mm_cmplt_epi16(__m128i __a, __m128i __b)
3292 {
3293  return _mm_cmpgt_epi16(__b, __a);
3294 }
3295 
3296 /// Compares each of the corresponding signed 32-bit values of the
3297 /// 128-bit integer vectors to determine if the values in the first operand
3298 /// are less than those in the second operand.
3299 ///
3300 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3301 ///
3302 /// \headerfile <x86intrin.h>
3303 ///
3304 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3305 ///
3306 /// \param __a
3307 /// A 128-bit integer vector.
3308 /// \param __b
3309 /// A 128-bit integer vector.
3310 /// \returns A 128-bit integer vector containing the comparison results.
3311 static __inline__ __m128i __DEFAULT_FN_ATTRS
3312 _mm_cmplt_epi32(__m128i __a, __m128i __b)
3313 {
3314  return _mm_cmpgt_epi32(__b, __a);
3315 }
3316 
3317 #ifdef __x86_64__
3318 /// Converts a 64-bit signed integer value from the second operand into a
3319 /// double-precision value and returns it in the lower element of a [2 x
3320 /// double] vector; the upper element of the returned vector is copied from
3321 /// the upper element of the first operand.
3322 ///
3323 /// \headerfile <x86intrin.h>
3324 ///
3325 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3326 ///
3327 /// \param __a
3328 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3329 /// copied to the upper 64 bits of the destination.
3330 /// \param __b
3331 /// A 64-bit signed integer operand containing the value to be converted.
3332 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3333 /// converted value of the second operand. The upper 64 bits are copied from
3334 /// the upper 64 bits of the first operand.
3335 static __inline__ __m128d __DEFAULT_FN_ATTRS
3336 _mm_cvtsi64_sd(__m128d __a, long long __b)
3337 {
3338  __a[0] = __b;
3339  return __a;
3340 }
3341 
3342 /// Converts the first (lower) element of a vector of [2 x double] into a
3343 /// 64-bit signed integer value, according to the current rounding mode.
3344 ///
3345 /// \headerfile <x86intrin.h>
3346 ///
3347 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3348 ///
3349 /// \param __a
3350 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3351 /// conversion.
3352 /// \returns A 64-bit signed integer containing the converted value.
3353 static __inline__ long long __DEFAULT_FN_ATTRS
3354 _mm_cvtsd_si64(__m128d __a)
3355 {
3356  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3357 }
3358 
3359 /// Converts the first (lower) element of a vector of [2 x double] into a
3360 /// 64-bit signed integer value, truncating the result when it is inexact.
3361 ///
3362 /// \headerfile <x86intrin.h>
3363 ///
3364 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3365 /// instruction.
3366 ///
3367 /// \param __a
3368 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3369 /// conversion.
3370 /// \returns A 64-bit signed integer containing the converted value.
3371 static __inline__ long long __DEFAULT_FN_ATTRS
3372 _mm_cvttsd_si64(__m128d __a)
3373 {
3374  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3375 }
3376 #endif
3377 
3378 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3379 ///
3380 /// \headerfile <x86intrin.h>
3381 ///
3382 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3383 ///
3384 /// \param __a
3385 /// A 128-bit integer vector.
3386 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3387 static __inline__ __m128 __DEFAULT_FN_ATTRS
3388 _mm_cvtepi32_ps(__m128i __a)
3389 {
3390  return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
3391 }
3392 
3393 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3394 ///
3395 /// \headerfile <x86intrin.h>
3396 ///
3397 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3398 ///
3399 /// \param __a
3400 /// A 128-bit vector of [4 x float].
3401 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3402 /// values.
3403 static __inline__ __m128i __DEFAULT_FN_ATTRS
3404 _mm_cvtps_epi32(__m128 __a)
3405 {
3406  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3407 }
3408 
3409 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3410 /// truncating the result when it is inexact.
3411 ///
3412 /// \headerfile <x86intrin.h>
3413 ///
3414 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3415 /// instruction.
3416 ///
3417 /// \param __a
3418 /// A 128-bit vector of [4 x float].
3419 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3420 static __inline__ __m128i __DEFAULT_FN_ATTRS
3421 _mm_cvttps_epi32(__m128 __a)
3422 {
3423  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3424 }
3425 
3426 /// Returns a vector of [4 x i32] where the lowest element is the input
3427 /// operand and the remaining elements are zero.
3428 ///
3429 /// \headerfile <x86intrin.h>
3430 ///
3431 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3432 ///
3433 /// \param __a
3434 /// A 32-bit signed integer operand.
3435 /// \returns A 128-bit vector of [4 x i32].
3436 static __inline__ __m128i __DEFAULT_FN_ATTRS
3438 {
3439  return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
3440 }
3441 
3442 #ifdef __x86_64__
3443 /// Returns a vector of [2 x i64] where the lower element is the input
3444 /// operand and the upper element is zero.
3445 ///
3446 /// \headerfile <x86intrin.h>
3447 ///
3448 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3449 ///
3450 /// \param __a
3451 /// A 64-bit signed integer operand containing the value to be converted.
3452 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3453 static __inline__ __m128i __DEFAULT_FN_ATTRS
3454 _mm_cvtsi64_si128(long long __a)
3455 {
3456  return __extension__ (__m128i)(__v2di){ __a, 0 };
3457 }
3458 #endif
3459 
3460 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3461 /// 32-bit signed integer value.
3462 ///
3463 /// \headerfile <x86intrin.h>
3464 ///
3465 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3466 ///
3467 /// \param __a
3468 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3469 /// destination.
3470 /// \returns A 32-bit signed integer containing the moved value.
3471 static __inline__ int __DEFAULT_FN_ATTRS
3472 _mm_cvtsi128_si32(__m128i __a)
3473 {
3474  __v4si __b = (__v4si)__a;
3475  return __b[0];
3476 }
3477 
3478 #ifdef __x86_64__
3479 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3480 /// 64-bit signed integer value.
3481 ///
3482 /// \headerfile <x86intrin.h>
3483 ///
3484 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3485 ///
3486 /// \param __a
3487 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3488 /// destination.
3489 /// \returns A 64-bit signed integer containing the moved value.
3490 static __inline__ long long __DEFAULT_FN_ATTRS
3491 _mm_cvtsi128_si64(__m128i __a)
3492 {
3493  return __a[0];
3494 }
3495 #endif
3496 
3497 /// Moves packed integer values from an aligned 128-bit memory location
3498 /// to elements in a 128-bit integer vector.
3499 ///
3500 /// \headerfile <x86intrin.h>
3501 ///
3502 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3503 ///
3504 /// \param __p
3505 /// An aligned pointer to a memory location containing integer values.
3506 /// \returns A 128-bit integer vector containing the moved values.
3507 static __inline__ __m128i __DEFAULT_FN_ATTRS
3508 _mm_load_si128(__m128i const *__p)
3509 {
3510  return *__p;
3511 }
3512 
3513 /// Moves packed integer values from an unaligned 128-bit memory location
3514 /// to elements in a 128-bit integer vector.
3515 ///
3516 /// \headerfile <x86intrin.h>
3517 ///
3518 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3519 ///
3520 /// \param __p
3521 /// A pointer to a memory location containing integer values.
3522 /// \returns A 128-bit integer vector containing the moved values.
3523 static __inline__ __m128i __DEFAULT_FN_ATTRS
3524 _mm_loadu_si128(__m128i const *__p)
3525 {
3526  struct __loadu_si128 {
3527  __m128i __v;
3528  } __attribute__((__packed__, __may_alias__));
3529  return ((struct __loadu_si128*)__p)->__v;
3530 }
3531 
3532 /// Returns a vector of [2 x i64] where the lower element is taken from
3533 /// the lower element of the operand, and the upper element is zero.
3534 ///
3535 /// \headerfile <x86intrin.h>
3536 ///
3537 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3538 ///
3539 /// \param __p
3540 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3541 /// the destination.
3542 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3543 /// moved value. The higher order bits are cleared.
3544 static __inline__ __m128i __DEFAULT_FN_ATTRS
3545 _mm_loadl_epi64(__m128i const *__p)
3546 {
3547  struct __mm_loadl_epi64_struct {
3548  long long __u;
3549  } __attribute__((__packed__, __may_alias__));
3550  return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3551 }
3552 
3553 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3554 /// This could be used as an argument to another intrinsic function where the
3555 /// argument is required but the value is not actually used.
3556 ///
3557 /// \headerfile <x86intrin.h>
3558 ///
3559 /// This intrinsic has no corresponding instruction.
3560 ///
3561 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3562 static __inline__ __m128i __DEFAULT_FN_ATTRS
3564 {
3565  return (__m128i)__builtin_ia32_undef128();
3566 }
3567 
3568 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3569 /// the specified 64-bit integer values.
3570 ///
3571 /// \headerfile <x86intrin.h>
3572 ///
3573 /// This intrinsic is a utility function and does not correspond to a specific
3574 /// instruction.
3575 ///
3576 /// \param __q1
3577 /// A 64-bit integer value used to initialize the upper 64 bits of the
3578 /// destination vector of [2 x i64].
3579 /// \param __q0
3580 /// A 64-bit integer value used to initialize the lower 64 bits of the
3581 /// destination vector of [2 x i64].
3582 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3583 /// provided in the operands.
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS
3585 _mm_set_epi64x(long long __q1, long long __q0)
3586 {
3587  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
3588 }
3589 
3590 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3591 /// the specified 64-bit integer values.
3592 ///
3593 /// \headerfile <x86intrin.h>
3594 ///
3595 /// This intrinsic is a utility function and does not correspond to a specific
3596 /// instruction.
3597 ///
3598 /// \param __q1
3599 /// A 64-bit integer value used to initialize the upper 64 bits of the
3600 /// destination vector of [2 x i64].
3601 /// \param __q0
3602 /// A 64-bit integer value used to initialize the lower 64 bits of the
3603 /// destination vector of [2 x i64].
3604 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3605 /// provided in the operands.
3606 static __inline__ __m128i __DEFAULT_FN_ATTRS
3607 _mm_set_epi64(__m64 __q1, __m64 __q0)
3608 {
3609  return _mm_set_epi64x((long long)__q1, (long long)__q0);
3610 }
3611 
3612 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3613 /// the specified 32-bit integer values.
3614 ///
3615 /// \headerfile <x86intrin.h>
3616 ///
3617 /// This intrinsic is a utility function and does not correspond to a specific
3618 /// instruction.
3619 ///
3620 /// \param __i3
3621 /// A 32-bit integer value used to initialize bits [127:96] of the
3622 /// destination vector.
3623 /// \param __i2
3624 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3625 /// vector.
3626 /// \param __i1
3627 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3628 /// vector.
3629 /// \param __i0
3630 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3631 /// vector.
3632 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3633 /// provided in the operands.
3634 static __inline__ __m128i __DEFAULT_FN_ATTRS
3635 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3636 {
3637  return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3638 }
3639 
3640 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3641 /// the specified 16-bit integer values.
3642 ///
3643 /// \headerfile <x86intrin.h>
3644 ///
3645 /// This intrinsic is a utility function and does not correspond to a specific
3646 /// instruction.
3647 ///
3648 /// \param __w7
3649 /// A 16-bit integer value used to initialize bits [127:112] of the
3650 /// destination vector.
3651 /// \param __w6
3652 /// A 16-bit integer value used to initialize bits [111:96] of the
3653 /// destination vector.
3654 /// \param __w5
3655 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3656 /// vector.
3657 /// \param __w4
3658 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3659 /// vector.
3660 /// \param __w3
3661 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3662 /// vector.
3663 /// \param __w2
3664 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3665 /// vector.
3666 /// \param __w1
3667 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3668 /// vector.
3669 /// \param __w0
3670 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3671 /// vector.
3672 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3673 /// provided in the operands.
3674 static __inline__ __m128i __DEFAULT_FN_ATTRS
3675 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3676 {
3677  return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3678 }
3679 
3680 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3681 /// the specified 8-bit integer values.
3682 ///
3683 /// \headerfile <x86intrin.h>
3684 ///
3685 /// This intrinsic is a utility function and does not correspond to a specific
3686 /// instruction.
3687 ///
3688 /// \param __b15
3689 /// Initializes bits [127:120] of the destination vector.
3690 /// \param __b14
3691 /// Initializes bits [119:112] of the destination vector.
3692 /// \param __b13
3693 /// Initializes bits [111:104] of the destination vector.
3694 /// \param __b12
3695 /// Initializes bits [103:96] of the destination vector.
3696 /// \param __b11
3697 /// Initializes bits [95:88] of the destination vector.
3698 /// \param __b10
3699 /// Initializes bits [87:80] of the destination vector.
3700 /// \param __b9
3701 /// Initializes bits [79:72] of the destination vector.
3702 /// \param __b8
3703 /// Initializes bits [71:64] of the destination vector.
3704 /// \param __b7
3705 /// Initializes bits [63:56] of the destination vector.
3706 /// \param __b6
3707 /// Initializes bits [55:48] of the destination vector.
3708 /// \param __b5
3709 /// Initializes bits [47:40] of the destination vector.
3710 /// \param __b4
3711 /// Initializes bits [39:32] of the destination vector.
3712 /// \param __b3
3713 /// Initializes bits [31:24] of the destination vector.
3714 /// \param __b2
3715 /// Initializes bits [23:16] of the destination vector.
3716 /// \param __b1
3717 /// Initializes bits [15:8] of the destination vector.
3718 /// \param __b0
3719 /// Initializes bits [7:0] of the destination vector.
3720 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3721 /// provided in the operands.
3722 static __inline__ __m128i __DEFAULT_FN_ATTRS
3723 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3724 {
3725  return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3726 }
3727 
3728 /// Initializes both values in a 128-bit integer vector with the
3729 /// specified 64-bit integer value.
3730 ///
3731 /// \headerfile <x86intrin.h>
3732 ///
3733 /// This intrinsic is a utility function and does not correspond to a specific
3734 /// instruction.
3735 ///
3736 /// \param __q
3737 /// Integer value used to initialize the elements of the destination integer
3738 /// vector.
3739 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3740 /// elements containing the value provided in the operand.
3741 static __inline__ __m128i __DEFAULT_FN_ATTRS
3742 _mm_set1_epi64x(long long __q)
3743 {
3744  return _mm_set_epi64x(__q, __q);
3745 }
3746 
3747 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3748 /// specified 64-bit value.
3749 ///
3750 /// \headerfile <x86intrin.h>
3751 ///
3752 /// This intrinsic is a utility function and does not correspond to a specific
3753 /// instruction.
3754 ///
3755 /// \param __q
3756 /// A 64-bit value used to initialize the elements of the destination integer
3757 /// vector.
3758 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3759 /// containing the value provided in the operand.
3760 static __inline__ __m128i __DEFAULT_FN_ATTRS
3761 _mm_set1_epi64(__m64 __q)
3762 {
3763  return _mm_set_epi64(__q, __q);
3764 }
3765 
3766 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3767 /// specified 32-bit value.
3768 ///
3769 /// \headerfile <x86intrin.h>
3770 ///
3771 /// This intrinsic is a utility function and does not correspond to a specific
3772 /// instruction.
3773 ///
3774 /// \param __i
3775 /// A 32-bit value used to initialize the elements of the destination integer
3776 /// vector.
3777 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3778 /// containing the value provided in the operand.
3779 static __inline__ __m128i __DEFAULT_FN_ATTRS
3781 {
3782  return _mm_set_epi32(__i, __i, __i, __i);
3783 }
3784 
3785 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3786 /// specified 16-bit value.
3787 ///
3788 /// \headerfile <x86intrin.h>
3789 ///
3790 /// This intrinsic is a utility function and does not correspond to a specific
3791 /// instruction.
3792 ///
3793 /// \param __w
3794 /// A 16-bit value used to initialize the elements of the destination integer
3795 /// vector.
3796 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3797 /// containing the value provided in the operand.
3798 static __inline__ __m128i __DEFAULT_FN_ATTRS
3799 _mm_set1_epi16(short __w)
3800 {
3801  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3802 }
3803 
3804 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3805 /// specified 8-bit value.
3806 ///
3807 /// \headerfile <x86intrin.h>
3808 ///
3809 /// This intrinsic is a utility function and does not correspond to a specific
3810 /// instruction.
3811 ///
3812 /// \param __b
3813 /// An 8-bit value used to initialize the elements of the destination integer
3814 /// vector.
3815 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3816 /// containing the value provided in the operand.
3817 static __inline__ __m128i __DEFAULT_FN_ATTRS
3818 _mm_set1_epi8(char __b)
3819 {
3820  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
3821 }
3822 
3823 /// Constructs a 128-bit integer vector, initialized in reverse order
3824 /// with the specified 64-bit integral values.
3825 ///
3826 /// \headerfile <x86intrin.h>
3827 ///
3828 /// This intrinsic does not correspond to a specific instruction.
3829 ///
3830 /// \param __q0
3831 /// A 64-bit integral value used to initialize the lower 64 bits of the
3832 /// result.
3833 /// \param __q1
3834 /// A 64-bit integral value used to initialize the upper 64 bits of the
3835 /// result.
3836 /// \returns An initialized 128-bit integer vector.
3837 static __inline__ __m128i __DEFAULT_FN_ATTRS
3838 _mm_setr_epi64(__m64 __q0, __m64 __q1)
3839 {
3840  return _mm_set_epi64(__q1, __q0);
3841 }
3842 
3843 /// Constructs a 128-bit integer vector, initialized in reverse order
3844 /// with the specified 32-bit integral values.
3845 ///
3846 /// \headerfile <x86intrin.h>
3847 ///
3848 /// This intrinsic is a utility function and does not correspond to a specific
3849 /// instruction.
3850 ///
3851 /// \param __i0
3852 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3853 /// \param __i1
3854 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3855 /// \param __i2
3856 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3857 /// \param __i3
3858 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3859 /// \returns An initialized 128-bit integer vector.
3860 static __inline__ __m128i __DEFAULT_FN_ATTRS
3861 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3862 {
3863  return _mm_set_epi32(__i3, __i2, __i1, __i0);
3864 }
3865 
3866 /// Constructs a 128-bit integer vector, initialized in reverse order
3867 /// with the specified 16-bit integral values.
3868 ///
3869 /// \headerfile <x86intrin.h>
3870 ///
3871 /// This intrinsic is a utility function and does not correspond to a specific
3872 /// instruction.
3873 ///
3874 /// \param __w0
3875 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3876 /// \param __w1
3877 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3878 /// \param __w2
3879 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3880 /// \param __w3
3881 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3882 /// \param __w4
3883 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3884 /// \param __w5
3885 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3886 /// \param __w6
3887 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3888 /// \param __w7
3889 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3890 /// \returns An initialized 128-bit integer vector.
3891 static __inline__ __m128i __DEFAULT_FN_ATTRS
3892 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3893 {
3894  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3895 }
3896 
3897 /// Constructs a 128-bit integer vector, initialized in reverse order
3898 /// with the specified 8-bit integral values.
3899 ///
3900 /// \headerfile <x86intrin.h>
3901 ///
3902 /// This intrinsic is a utility function and does not correspond to a specific
3903 /// instruction.
3904 ///
3905 /// \param __b0
3906 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3907 /// \param __b1
3908 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3909 /// \param __b2
3910 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3911 /// \param __b3
3912 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3913 /// \param __b4
3914 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3915 /// \param __b5
3916 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3917 /// \param __b6
3918 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3919 /// \param __b7
3920 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3921 /// \param __b8
3922 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3923 /// \param __b9
3924 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3925 /// \param __b10
3926 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3927 /// \param __b11
3928 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3929 /// \param __b12
3930 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3931 /// \param __b13
3932 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3933 /// \param __b14
3934 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3935 /// \param __b15
3936 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3937 /// \returns An initialized 128-bit integer vector.
3938 static __inline__ __m128i __DEFAULT_FN_ATTRS
3939 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3940 {
3941  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3942 }
3943 
3944 /// Creates a 128-bit integer vector initialized to zero.
3945 ///
3946 /// \headerfile <x86intrin.h>
3947 ///
3948 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3949 ///
3950 /// \returns An initialized 128-bit integer vector with all elements set to
3951 /// zero.
3952 static __inline__ __m128i __DEFAULT_FN_ATTRS
3954 {
3955  return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
3956 }
3957 
3958 /// Stores a 128-bit integer vector to a memory location aligned on a
3959 /// 128-bit boundary.
3960 ///
3961 /// \headerfile <x86intrin.h>
3962 ///
3963 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3964 ///
3965 /// \param __p
3966 /// A pointer to an aligned memory location that will receive the integer
3967 /// values.
3968 /// \param __b
3969 /// A 128-bit integer vector containing the values to be moved.
3970 static __inline__ void __DEFAULT_FN_ATTRS
3971 _mm_store_si128(__m128i *__p, __m128i __b)
3972 {
3973  *__p = __b;
3974 }
3975 
3976 /// Stores a 128-bit integer vector to an unaligned memory location.
3977 ///
3978 /// \headerfile <x86intrin.h>
3979 ///
3980 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3981 ///
3982 /// \param __p
3983 /// A pointer to a memory location that will receive the integer values.
3984 /// \param __b
3985 /// A 128-bit integer vector containing the values to be moved.
3986 static __inline__ void __DEFAULT_FN_ATTRS
3987 _mm_storeu_si128(__m128i *__p, __m128i __b)
3988 {
3989  struct __storeu_si128 {
3990  __m128i __v;
3991  } __attribute__((__packed__, __may_alias__));
3992  ((struct __storeu_si128*)__p)->__v = __b;
3993 }
3994 
3995 /// Moves bytes selected by the mask from the first operand to the
3996 /// specified unaligned memory location. When a mask bit is 1, the
3997 /// corresponding byte is written, otherwise it is not written.
3998 ///
3999 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4000 /// used again soon). Exception and trap behavior for elements not selected
4001 /// for storage to memory are implementation dependent.
4002 ///
4003 /// \headerfile <x86intrin.h>
4004 ///
4005 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
4006 /// instruction.
4007 ///
4008 /// \param __d
4009 /// A 128-bit integer vector containing the values to be moved.
4010 /// \param __n
4011 /// A 128-bit integer vector containing the mask. The most significant bit of
4012 /// each byte represents the mask bits.
4013 /// \param __p
4014 /// A pointer to an unaligned 128-bit memory location where the specified
4015 /// values are moved.
4016 static __inline__ void __DEFAULT_FN_ATTRS
4017 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
4018 {
4019  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4020 }
4021 
4022 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4023 /// a memory location.
4024 ///
4025 /// \headerfile <x86intrin.h>
4026 ///
4027 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4028 ///
4029 /// \param __p
4030 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
4031 /// of the integer vector parameter.
4032 /// \param __a
4033 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4034 /// value to be stored.
4035 static __inline__ void __DEFAULT_FN_ATTRS
4036 _mm_storel_epi64(__m128i *__p, __m128i __a)
4037 {
4038  struct __mm_storel_epi64_struct {
4039  long long __u;
4040  } __attribute__((__packed__, __may_alias__));
4041  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
4042 }
4043 
4044 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4045 /// aligned memory location.
4046 ///
4047 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4048 /// used again soon).
4049 ///
4050 /// \headerfile <x86intrin.h>
4051 ///
4052 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4053 ///
4054 /// \param __p
4055 /// A pointer to the 128-bit aligned memory location used to store the value.
4056 /// \param __a
4057 /// A vector of [2 x double] containing the 64-bit values to be stored.
4058 static __inline__ void __DEFAULT_FN_ATTRS
4059 _mm_stream_pd(double *__p, __m128d __a)
4060 {
4061  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
4062 }
4063 
4064 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4065 ///
4066 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4067 /// used again soon).
4068 ///
4069 /// \headerfile <x86intrin.h>
4070 ///
4071 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4072 ///
4073 /// \param __p
4074 /// A pointer to the 128-bit aligned memory location used to store the value.
4075 /// \param __a
4076 /// A 128-bit integer vector containing the values to be stored.
4077 static __inline__ void __DEFAULT_FN_ATTRS
4078 _mm_stream_si128(__m128i *__p, __m128i __a)
4079 {
4080  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
4081 }
4082 
4083 /// Stores a 32-bit integer value in the specified memory location.
4084 ///
4085 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4086 /// used again soon).
4087 ///
4088 /// \headerfile <x86intrin.h>
4089 ///
4090 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4091 ///
4092 /// \param __p
4093 /// A pointer to the 32-bit memory location used to store the value.
4094 /// \param __a
4095 /// A 32-bit integer containing the value to be stored.
4096 static __inline__ void __DEFAULT_FN_ATTRS
4097 _mm_stream_si32(int *__p, int __a)
4098 {
4099  __builtin_ia32_movnti(__p, __a);
4100 }
4101 
4102 #ifdef __x86_64__
4103 /// Stores a 64-bit integer value in the specified memory location.
4104 ///
4105 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4106 /// used again soon).
4107 ///
4108 /// \headerfile <x86intrin.h>
4109 ///
4110 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4111 ///
4112 /// \param __p
4113 /// A pointer to the 64-bit memory location used to store the value.
4114 /// \param __a
4115 /// A 64-bit integer containing the value to be stored.
4116 static __inline__ void __DEFAULT_FN_ATTRS
4117 _mm_stream_si64(long long *__p, long long __a)
4118 {
4119  __builtin_ia32_movnti64(__p, __a);
4120 }
4121 #endif
4122 
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all caches
///    in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and store
///    instructions that precede this instruction and the load and store
///    instructions that follow it, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4164 
4165 /// Converts 16-bit signed integers from both 128-bit integer vector
4166 /// operands into 8-bit signed integers, and packs the results into the
4167 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4168 /// Negative values less than 0x80 are saturated to 0x80.
4169 ///
4170 /// \headerfile <x86intrin.h>
4171 ///
4172 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4173 ///
4174 /// \param __a
4175 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4176 /// a signed integer and is converted to a 8-bit signed integer with
4177 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4178 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4179 /// written to the lower 64 bits of the result.
4180 /// \param __b
4181 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4182 /// a signed integer and is converted to a 8-bit signed integer with
4183 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4184 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4185 /// written to the higher 64 bits of the result.
4186 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4187 static __inline__ __m128i __DEFAULT_FN_ATTRS
4188 _mm_packs_epi16(__m128i __a, __m128i __b)
4189 {
4190  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4191 }
4192 
4193 /// Converts 32-bit signed integers from both 128-bit integer vector
4194 /// operands into 16-bit signed integers, and packs the results into the
4195 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4196 /// Negative values less than 0x8000 are saturated to 0x8000.
4197 ///
4198 /// \headerfile <x86intrin.h>
4199 ///
4200 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4201 ///
4202 /// \param __a
4203 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4204 /// a signed integer and is converted to a 16-bit signed integer with
4205 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4206 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4207 /// are written to the lower 64 bits of the result.
4208 /// \param __b
4209 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4210 /// a signed integer and is converted to a 16-bit signed integer with
4211 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4212 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4213 /// are written to the higher 64 bits of the result.
4214 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4215 static __inline__ __m128i __DEFAULT_FN_ATTRS
4216 _mm_packs_epi32(__m128i __a, __m128i __b)
4217 {
4218  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4219 }
4220 
4221 /// Converts 16-bit signed integers from both 128-bit integer vector
4222 /// operands into 8-bit unsigned integers, and packs the results into the
4223 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4224 /// than 0x00 are saturated to 0x00.
4225 ///
4226 /// \headerfile <x86intrin.h>
4227 ///
4228 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4229 ///
4230 /// \param __a
4231 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4232 /// a signed integer and is converted to an 8-bit unsigned integer with
4233 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4234 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4235 /// written to the lower 64 bits of the result.
4236 /// \param __b
4237 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4238 /// a signed integer and is converted to an 8-bit unsigned integer with
4239 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4240 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4241 /// written to the higher 64 bits of the result.
4242 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4243 static __inline__ __m128i __DEFAULT_FN_ATTRS
4244 _mm_packus_epi16(__m128i __a, __m128i __b)
4245 {
4246  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4247 }
4248 
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the value from \a a that is
///    assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
/* The whole expansion is parenthesized so that operators written after the
   macro invocation apply to the final int, not to the builtin call. */
#define _mm_extract_epi16(a, imm) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                    (int)(imm)))
4274 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into the element selected by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value selecting which 16-bit element of the result
///    receives the lower 16 bits of \a b.
/// \returns A 128-bit integer vector containing the constructed values.
/* The whole expansion is parenthesized so that operators written after the
   macro invocation apply to the final __m128i, not to the builtin call. */
#define _mm_insert_epi16(a, b, imm) \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                        (int)(imm)))
4298 
4299 /// Copies the values of the most significant bits from each 8-bit
4300 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4301 /// value, zero-extends the value, and writes it to the destination.
4302 ///
4303 /// \headerfile <x86intrin.h>
4304 ///
4305 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4306 ///
4307 /// \param __a
4308 /// A 128-bit integer vector containing the values with bits to be extracted.
4309 /// \returns The most significant bits from each 8-bit element in \a __a,
4310 /// written to bits [15:0]. The other bits are assigned zeros.
4311 static __inline__ int __DEFAULT_FN_ATTRS
4312 _mm_movemask_epi8(__m128i __a)
4313 {
4314  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4315 }
4316 
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the
///    immediate-value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements
///    to copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that operators written after the
   macro invocation apply to the final __m128i, not to the builtin call. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4347 
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that operators written after the
   macro invocation apply to the final __m128i, not to the builtin call. */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4377 
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that operators written after the
   macro invocation apply to the final __m128i, not to the builtin call. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4407 
4408 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4409 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4410 ///
4411 /// \headerfile <x86intrin.h>
4412 ///
4413 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4414 /// instruction.
4415 ///
4416 /// \param __a
4417 /// A 128-bit vector of [16 x i8].
4418 /// Bits [71:64] are written to bits [7:0] of the result. \n
4419 /// Bits [79:72] are written to bits [23:16] of the result. \n
4420 /// Bits [87:80] are written to bits [39:32] of the result. \n
4421 /// Bits [95:88] are written to bits [55:48] of the result. \n
4422 /// Bits [103:96] are written to bits [71:64] of the result. \n
4423 /// Bits [111:104] are written to bits [87:80] of the result. \n
4424 /// Bits [119:112] are written to bits [103:96] of the result. \n
4425 /// Bits [127:120] are written to bits [119:112] of the result.
4426 /// \param __b
4427 /// A 128-bit vector of [16 x i8]. \n
4428 /// Bits [71:64] are written to bits [15:8] of the result. \n
4429 /// Bits [79:72] are written to bits [31:24] of the result. \n
4430 /// Bits [87:80] are written to bits [47:40] of the result. \n
4431 /// Bits [95:88] are written to bits [63:56] of the result. \n
4432 /// Bits [103:96] are written to bits [79:72] of the result. \n
4433 /// Bits [111:104] are written to bits [95:88] of the result. \n
4434 /// Bits [119:112] are written to bits [111:104] of the result. \n
4435 /// Bits [127:120] are written to bits [127:120] of the result.
4436 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4437 static __inline__ __m128i __DEFAULT_FN_ATTRS
4438 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
4439 {
4440  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
4441 }
4442 
4443 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4444 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4445 ///
4446 /// \headerfile <x86intrin.h>
4447 ///
4448 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4449 /// instruction.
4450 ///
4451 /// \param __a
4452 /// A 128-bit vector of [8 x i16].
4453 /// Bits [79:64] are written to bits [15:0] of the result. \n
4454 /// Bits [95:80] are written to bits [47:32] of the result. \n
4455 /// Bits [111:96] are written to bits [79:64] of the result. \n
4456 /// Bits [127:112] are written to bits [111:96] of the result.
4457 /// \param __b
4458 /// A 128-bit vector of [8 x i16].
4459 /// Bits [79:64] are written to bits [31:16] of the result. \n
4460 /// Bits [95:80] are written to bits [63:48] of the result. \n
4461 /// Bits [111:96] are written to bits [95:80] of the result. \n
4462 /// Bits [127:112] are written to bits [127:112] of the result.
4463 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4464 static __inline__ __m128i __DEFAULT_FN_ATTRS
4465 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
4466 {
4467  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4468 }
4469 
4470 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4471 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4472 ///
4473 /// \headerfile <x86intrin.h>
4474 ///
4475 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4476 /// instruction.
4477 ///
4478 /// \param __a
4479 /// A 128-bit vector of [4 x i32]. \n
4480 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4481 /// Bits [127:96] are written to bits [95:64] of the destination.
4482 /// \param __b
4483 /// A 128-bit vector of [4 x i32]. \n
4484 /// Bits [95:64] are written to bits [64:32] of the destination. \n
4485 /// Bits [127:96] are written to bits [127:96] of the destination.
4486 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4487 static __inline__ __m128i __DEFAULT_FN_ATTRS
4488 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
4489 {
4490  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4491 }
4492 
4493 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4494 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4495 ///
4496 /// \headerfile <x86intrin.h>
4497 ///
4498 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4499 /// instruction.
4500 ///
4501 /// \param __a
4502 /// A 128-bit vector of [2 x i64]. \n
4503 /// Bits [127:64] are written to bits [63:0] of the destination.
4504 /// \param __b
4505 /// A 128-bit vector of [2 x i64]. \n
4506 /// Bits [127:64] are written to bits [127:64] of the destination.
4507 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4508 static __inline__ __m128i __DEFAULT_FN_ATTRS
4509 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
4510 {
4511  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4512 }
4513 
4514 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4515 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4516 ///
4517 /// \headerfile <x86intrin.h>
4518 ///
4519 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4520 /// instruction.
4521 ///
4522 /// \param __a
4523 /// A 128-bit vector of [16 x i8]. \n
4524 /// Bits [7:0] are written to bits [7:0] of the result. \n
4525 /// Bits [15:8] are written to bits [23:16] of the result. \n
4526 /// Bits [23:16] are written to bits [39:32] of the result. \n
4527 /// Bits [31:24] are written to bits [55:48] of the result. \n
4528 /// Bits [39:32] are written to bits [71:64] of the result. \n
4529 /// Bits [47:40] are written to bits [87:80] of the result. \n
4530 /// Bits [55:48] are written to bits [103:96] of the result. \n
4531 /// Bits [63:56] are written to bits [119:112] of the result.
4532 /// \param __b
4533 /// A 128-bit vector of [16 x i8].
4534 /// Bits [7:0] are written to bits [15:8] of the result. \n
4535 /// Bits [15:8] are written to bits [31:24] of the result. \n
4536 /// Bits [23:16] are written to bits [47:40] of the result. \n
4537 /// Bits [31:24] are written to bits [63:56] of the result. \n
4538 /// Bits [39:32] are written to bits [79:72] of the result. \n
4539 /// Bits [47:40] are written to bits [95:88] of the result. \n
4540 /// Bits [55:48] are written to bits [111:104] of the result. \n
4541 /// Bits [63:56] are written to bits [127:120] of the result.
4542 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4543 static __inline__ __m128i __DEFAULT_FN_ATTRS
4544 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
4545 {
4546  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
4547 }
4548 
4549 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4550 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4551 /// [8 x i16].
4552 ///
4553 /// \headerfile <x86intrin.h>
4554 ///
4555 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4556 /// instruction.
4557 ///
4558 /// \param __a
4559 /// A 128-bit vector of [8 x i16].
4560 /// Bits [15:0] are written to bits [15:0] of the result. \n
4561 /// Bits [31:16] are written to bits [47:32] of the result. \n
4562 /// Bits [47:32] are written to bits [79:64] of the result. \n
4563 /// Bits [63:48] are written to bits [111:96] of the result.
4564 /// \param __b
4565 /// A 128-bit vector of [8 x i16].
4566 /// Bits [15:0] are written to bits [31:16] of the result. \n
4567 /// Bits [31:16] are written to bits [63:48] of the result. \n
4568 /// Bits [47:32] are written to bits [95:80] of the result. \n
4569 /// Bits [63:48] are written to bits [127:112] of the result.
4570 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4571 static __inline__ __m128i __DEFAULT_FN_ATTRS
4572 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
4573 {
4574  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4575 }
4576 
4577 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4578 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4579 ///
4580 /// \headerfile <x86intrin.h>
4581 ///
4582 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4583 /// instruction.
4584 ///
4585 /// \param __a
4586 /// A 128-bit vector of [4 x i32]. \n
4587 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4588 /// Bits [63:32] are written to bits [95:64] of the destination.
4589 /// \param __b
4590 /// A 128-bit vector of [4 x i32]. \n
4591 /// Bits [31:0] are written to bits [64:32] of the destination. \n
4592 /// Bits [63:32] are written to bits [127:96] of the destination.
4593 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4594 static __inline__ __m128i __DEFAULT_FN_ATTRS
4595 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
4596 {
4597  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4598 }
4599 
4600 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4601 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4602 ///
4603 /// \headerfile <x86intrin.h>
4604 ///
4605 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4606 /// instruction.
4607 ///
4608 /// \param __a
4609 /// A 128-bit vector of [2 x i64]. \n
4610 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4611 /// \param __b
4612 /// A 128-bit vector of [2 x i64]. \n
4613 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4614 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4615 static __inline__ __m128i __DEFAULT_FN_ATTRS
4616 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
4617 {
4618  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4619 }
4620 
4621 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4622 /// integer.
4623 ///
4624 /// \headerfile <x86intrin.h>
4625 ///
4626 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4627 ///
4628 /// \param __a
4629 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4630 /// destination.
4631 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4632 static __inline__ __m64 __DEFAULT_FN_ATTRS
4633 _mm_movepi64_pi64(__m128i __a)
4634 {
4635  return (__m64)__a[0];
4636 }
4637 
4638 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4639 /// upper bits.
4640 ///
4641 /// \headerfile <x86intrin.h>
4642 ///
4643 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4644 ///
4645 /// \param __a
4646 /// A 64-bit value.
4647 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4648 /// the operand. The upper 64 bits are assigned zeros.
4649 static __inline__ __m128i __DEFAULT_FN_ATTRS
4651 {
4652  return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
4653 }
4654 
4655 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4656 /// integer vector, zeroing the upper bits.
4657 ///
4658 /// \headerfile <x86intrin.h>
4659 ///
4660 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4661 ///
4662 /// \param __a
4663 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4664 /// destination.
4665 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4666 /// the operand. The upper 64 bits are assigned zeros.
4667 static __inline__ __m128i __DEFAULT_FN_ATTRS
4668 _mm_move_epi64(__m128i __a)
4669 {
4670  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4671 }
4672 
4673 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4674 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4675 /// double].
4676 ///
4677 /// \headerfile <x86intrin.h>
4678 ///
4679 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4680 ///
4681 /// \param __a
4682 /// A 128-bit vector of [2 x double]. \n
4683 /// Bits [127:64] are written to bits [63:0] of the destination.
4684 /// \param __b
4685 /// A 128-bit vector of [2 x double]. \n
4686 /// Bits [127:64] are written to bits [127:64] of the destination.
4687 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4688 static __inline__ __m128d __DEFAULT_FN_ATTRS
4689 _mm_unpackhi_pd(__m128d __a, __m128d __b)
4690 {
4691  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4692 }
4693 
4694 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4695 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4696 /// double].
4697 ///
4698 /// \headerfile <x86intrin.h>
4699 ///
4700 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4701 ///
4702 /// \param __a
4703 /// A 128-bit vector of [2 x double]. \n
4704 /// Bits [63:0] are written to bits [63:0] of the destination.
4705 /// \param __b
4706 /// A 128-bit vector of [2 x double]. \n
4707 /// Bits [63:0] are written to bits [127:64] of the destination.
4708 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4709 static __inline__ __m128d __DEFAULT_FN_ATTRS
4710 _mm_unpacklo_pd(__m128d __a, __m128d __b)
4711 {
4712  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4713 }
4714 
4715 /// Extracts the sign bits of the double-precision values in the 128-bit
4716 /// vector of [2 x double], zero-extends the value, and writes it to the
4717 /// low-order bits of the destination.
4718 ///
4719 /// \headerfile <x86intrin.h>
4720 ///
4721 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4722 ///
4723 /// \param __a
4724 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4725 /// be extracted.
4726 /// \returns The sign bits from each of the double-precision elements in \a __a,
4727 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4728 static __inline__ int __DEFAULT_FN_ATTRS
4729 _mm_movemask_pd(__m128d __a)
4730 {
4731  return __builtin_ia32_movmskpd((__v2df)__a);
4732 }
4733 
4734 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
//
// The whole expansion is parenthesized so that the macro behaves as a single
// primary expression: without the outer parentheses a postfix operator applied
// to the macro result (for example a subscript) would bind to the builtin call
// before the cast to __m128d.
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                  (int)(i)))
4762 
4763 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4764 /// floating-point vector of [4 x float].
4765 ///
4766 /// \headerfile <x86intrin.h>
4767 ///
4768 /// This intrinsic has no corresponding instruction.
4769 ///
4770 /// \param __a
4771 /// A 128-bit floating-point vector of [2 x double].
4772 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4773 /// bitwise pattern as the parameter.
4774 static __inline__ __m128 __DEFAULT_FN_ATTRS
4775 _mm_castpd_ps(__m128d __a)
4776 {
4777  return (__m128)__a;
4778 }
4779 
4780 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4781 /// integer vector.
4782 ///
4783 /// \headerfile <x86intrin.h>
4784 ///
4785 /// This intrinsic has no corresponding instruction.
4786 ///
4787 /// \param __a
4788 /// A 128-bit floating-point vector of [2 x double].
4789 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4790 /// parameter.
4791 static __inline__ __m128i __DEFAULT_FN_ATTRS
4792 _mm_castpd_si128(__m128d __a)
4793 {
4794  return (__m128i)__a;
4795 }
4796 
4797 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4798 /// floating-point vector of [2 x double].
4799 ///
4800 /// \headerfile <x86intrin.h>
4801 ///
4802 /// This intrinsic has no corresponding instruction.
4803 ///
4804 /// \param __a
4805 /// A 128-bit floating-point vector of [4 x float].
4806 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4807 /// bitwise pattern as the parameter.
4808 static __inline__ __m128d __DEFAULT_FN_ATTRS
4809 _mm_castps_pd(__m128 __a)
4810 {
4811  return (__m128d)__a;
4812 }
4813 
4814 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4815 /// integer vector.
4816 ///
4817 /// \headerfile <x86intrin.h>
4818 ///
4819 /// This intrinsic has no corresponding instruction.
4820 ///
4821 /// \param __a
4822 /// A 128-bit floating-point vector of [4 x float].
4823 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4824 /// parameter.
4825 static __inline__ __m128i __DEFAULT_FN_ATTRS
4826 _mm_castps_si128(__m128 __a)
4827 {
4828  return (__m128i)__a;
4829 }
4830 
4831 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4832 /// of [4 x float].
4833 ///
4834 /// \headerfile <x86intrin.h>
4835 ///
4836 /// This intrinsic has no corresponding instruction.
4837 ///
4838 /// \param __a
4839 /// A 128-bit integer vector.
4840 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4841 /// bitwise pattern as the parameter.
4842 static __inline__ __m128 __DEFAULT_FN_ATTRS
4843 _mm_castsi128_ps(__m128i __a)
4844 {
4845  return (__m128)__a;
4846 }
4847 
4848 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4849 /// of [2 x double].
4850 ///
4851 /// \headerfile <x86intrin.h>
4852 ///
4853 /// This intrinsic has no corresponding instruction.
4854 ///
4855 /// \param __a
4856 /// A 128-bit integer vector.
4857 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4858 /// bitwise pattern as the parameter.
4859 static __inline__ __m128d __DEFAULT_FN_ATTRS
4860 _mm_castsi128_pd(__m128i __a)
4861 {
4862  return (__m128d)__a;
4863 }
4864 
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is executing a spin loop, for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
/* Remove the attribute helper macros applied to the inline definitions above
 * so that they do not remain defined past this point. */
4881 #undef __DEFAULT_FN_ATTRS
4882 #undef __DEFAULT_FN_ATTRS_MMX
4883 
/* Combines two selector values into one integer: x is shifted left by one bit
 * and ORed with y.
 * NOTE(review): presumably meant to build the immediate operand of
 * _mm_shuffle_pd (y selecting the element of a, x the element of b) -- confirm
 * against callers. */
4884 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4885 
/* Values for the denormals-are-zero setting: ON is the bit covered by
 * _MM_DENORMALS_ZERO_MASK, OFF is that bit cleared. */
4886 #define _MM_DENORMALS_ZERO_ON (0x0040)
4887 #define _MM_DENORMALS_ZERO_OFF (0x0000)
4888 
/* Mask that isolates the denormals-are-zero bit in the value read by
 * _mm_getcsr() and written by _mm_setcsr(). */
4889 #define _MM_DENORMALS_ZERO_MASK (0x0040)
4890 
/* GET yields the value returned by _mm_getcsr() with only the mask bit kept.
 * SET clears the mask bit in the current _mm_getcsr() value, ORs in x and
 * passes the result to _mm_setcsr(). x is not masked, so any other bits set
 * in x are written as well; pass one of the _MM_DENORMALS_ZERO_* values. */
4891 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4892 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4893 
4894 #endif /* __EMMINTRIN_H */
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:499
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2854
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3033
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1386
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1261
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3675
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of each of the two values stored in a 128-bit vector of [2 x double]...
Definition: emmintrin.h:255
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3939
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2873
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2931
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1209
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1484
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1349
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value...
Definition: emmintrin.h:3761
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3071
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3861
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4216
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value...
Definition: emmintrin.h:3742
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1788
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1055
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2060
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4826
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1806
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2129
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2816
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:770
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1107
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1672
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2619
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:323
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3109
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1235
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4710
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1306
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2546
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1369
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2487
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1747
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2701
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4488
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3166
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3892
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:651
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location...
Definition: emmintrin.h:4017
double __m128d __attribute__((__vector_size__(16)))
Definition: emmintrin.h:29
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3291
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3607
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand...
Definition: emmintrin.h:3545
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1287
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:279
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:67
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3818
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:796
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4650
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4633
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1556
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1652
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2911
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1608
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4775
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3312
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:695
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3524
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3585
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:38
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors...
Definition: emmintrin.h:2212
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location...
Definition: emmintrin.h:1940
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1183
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers...
Definition: emmintrin.h:4188
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2971
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2582
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1539
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit vector of [2 x double] and clears the u...
Definition: emmintrin.h:1693
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3249
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1865
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3437
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:672
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3508
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:2022
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1880
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:128
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4809
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:192
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1411
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1326
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2169
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1824
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:720
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:478
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2601
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3421
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1844
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2892
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1157
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2388
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1590
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2951
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:952
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:86
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:824
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:978
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2085
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors...
Definition: emmintrin.h:2190
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3185
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4668
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:400
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1918
static __inline__ vector float vector float __b
Definition: altivec.h:534
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:927
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3205
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:609
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:212
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4616
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:237
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3635
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3228
static __inline unsigned char unsigned int __x
Definition: adxintrin.h:36
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2107
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4689
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4078
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2408
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1999
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1573
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:150
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:4036
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2991
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:109
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:361
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4792
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2468
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value...
Definition: emmintrin.h:1502
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2719
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value...
Definition: emmintrin.h:3472
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3723
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3090
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4595
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:902
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4860
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:299
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2348
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2368
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:543
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:418
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3838
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors...
Definition: emmintrin.h:2254
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4544
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2040
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1434
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3953
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2506
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:382
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4509
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:745
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2328
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value...
Definition: emmintrin.h:3780
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1519
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4572
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2661
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1003
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2428
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4244
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
Definition: emmintrin.h:2681
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3971
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3270
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3563
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors...
Definition: emmintrin.h:2528
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1081
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2448
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1029
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4438
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3128
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating point vector of [2 x double] to a 128-bit aligned memory location...
Definition: emmintrin.h:4059
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value...
Definition: emmintrin.h:3799
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1901
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:852
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:520
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:877
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors...
Definition: emmintrin.h:2298
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:457
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3052
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2835
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1720
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:588
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:48
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3404
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2564
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:437
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:343
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float]...
Definition: emmintrin.h:1460
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2774
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double]...
Definition: emmintrin.h:4729
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2640
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum...
Definition: emmintrin.h:2147
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2756
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3987
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si32(int *__p, int __a)
Stores a 32-bit integer value in the specified memory location.
Definition: emmintrin.h:4097
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4312
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:169
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1634
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1960
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3388
#define __DEFAULT_FN_ATTRS_MMX
Definition: emmintrin.h:49
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors...
Definition: emmintrin.h:2274
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4199
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors...
Definition: emmintrin.h:2233
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1768
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1981
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:630
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2739
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3147
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4465
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:567