clang  9.0.0svn
emmintrin.h
Go to the documentation of this file.
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to deal
5  * in the Software without restriction, including without limitation the rights
6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7  * copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19  * THE SOFTWARE.
20  *
21  *===-----------------------------------------------------------------------===
22  */
23 
24 #ifndef __EMMINTRIN_H
25 #define __EMMINTRIN_H
26 
27 #include <xmmintrin.h>
28 
/* Public 128-bit vector types: two doubles / two 64-bit integers, declared
 * with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Variants of the public types declared with an alignment of 1 byte. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Internal 16-byte element-typed vectors: 2 x double, 2 x long long,
 * 8 x short and 16 x char. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Internal 16-byte vectors with unsigned element types. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* An explicitly signed 16 x char vector. It is an implementation detail and
 * should not appear in the interface. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

/* Attribute sets applied to every function defined in this file. The _MMX
 * flavour additionally targets "mmx" and uses a minimum vector width of 64. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
                 __min_vector_width__(64)))

53 
54 /// Adds lower double-precision values in both operands and returns the
55 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
56 /// are copied from the upper double-precision value of the first operand.
57 ///
58 /// \headerfile <x86intrin.h>
59 ///
60 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
61 ///
62 /// \param __a
63 /// A 128-bit vector of [2 x double] containing one of the source operands.
64 /// \param __b
65 /// A 128-bit vector of [2 x double] containing one of the source operands.
66 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
67 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
68 /// from the upper 64 bits of the first source operand.
69 static __inline__ __m128d __DEFAULT_FN_ATTRS
70 _mm_add_sd(__m128d __a, __m128d __b)
71 {
72  __a[0] += __b[0];
73  return __a;
74 }
75 
76 /// Adds two 128-bit vectors of [2 x double].
77 ///
78 /// \headerfile <x86intrin.h>
79 ///
80 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
81 ///
82 /// \param __a
83 /// A 128-bit vector of [2 x double] containing one of the source operands.
84 /// \param __b
85 /// A 128-bit vector of [2 x double] containing one of the source operands.
86 /// \returns A 128-bit vector of [2 x double] containing the sums of both
87 /// operands.
88 static __inline__ __m128d __DEFAULT_FN_ATTRS
89 _mm_add_pd(__m128d __a, __m128d __b)
90 {
91  return (__m128d)((__v2df)__a + (__v2df)__b);
92 }
93 
94 /// Subtracts the lower double-precision value of the second operand
95 /// from the lower double-precision value of the first operand and returns
96 /// the difference in the lower 64 bits of the result. The upper 64 bits of
97 /// the result are copied from the upper double-precision value of the first
98 /// operand.
99 ///
100 /// \headerfile <x86intrin.h>
101 ///
102 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
103 ///
104 /// \param __a
105 /// A 128-bit vector of [2 x double] containing the minuend.
106 /// \param __b
107 /// A 128-bit vector of [2 x double] containing the subtrahend.
108 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
109 /// difference of the lower 64 bits of both operands. The upper 64 bits are
110 /// copied from the upper 64 bits of the first source operand.
111 static __inline__ __m128d __DEFAULT_FN_ATTRS
112 _mm_sub_sd(__m128d __a, __m128d __b)
113 {
114  __a[0] -= __b[0];
115  return __a;
116 }
117 
118 /// Subtracts two 128-bit vectors of [2 x double].
119 ///
120 /// \headerfile <x86intrin.h>
121 ///
122 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
123 ///
124 /// \param __a
125 /// A 128-bit vector of [2 x double] containing the minuend.
126 /// \param __b
127 /// A 128-bit vector of [2 x double] containing the subtrahend.
128 /// \returns A 128-bit vector of [2 x double] containing the differences between
129 /// both operands.
130 static __inline__ __m128d __DEFAULT_FN_ATTRS
131 _mm_sub_pd(__m128d __a, __m128d __b)
132 {
133  return (__m128d)((__v2df)__a - (__v2df)__b);
134 }
135 
136 /// Multiplies lower double-precision values in both operands and returns
137 /// the product in the lower 64 bits of the result. The upper 64 bits of the
138 /// result are copied from the upper double-precision value of the first
139 /// operand.
140 ///
141 /// \headerfile <x86intrin.h>
142 ///
143 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
144 ///
145 /// \param __a
146 /// A 128-bit vector of [2 x double] containing one of the source operands.
147 /// \param __b
148 /// A 128-bit vector of [2 x double] containing one of the source operands.
149 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
150 /// product of the lower 64 bits of both operands. The upper 64 bits are
151 /// copied from the upper 64 bits of the first source operand.
152 static __inline__ __m128d __DEFAULT_FN_ATTRS
153 _mm_mul_sd(__m128d __a, __m128d __b)
154 {
155  __a[0] *= __b[0];
156  return __a;
157 }
158 
159 /// Multiplies two 128-bit vectors of [2 x double].
160 ///
161 /// \headerfile <x86intrin.h>
162 ///
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164 ///
165 /// \param __a
166 /// A 128-bit vector of [2 x double] containing one of the operands.
167 /// \param __b
168 /// A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
170 /// operands.
171 static __inline__ __m128d __DEFAULT_FN_ATTRS
172 _mm_mul_pd(__m128d __a, __m128d __b)
173 {
174  return (__m128d)((__v2df)__a * (__v2df)__b);
175 }
176 
177 /// Divides the lower double-precision value of the first operand by the
178 /// lower double-precision value of the second operand and returns the
179 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
180 /// result are copied from the upper double-precision value of the first
181 /// operand.
182 ///
183 /// \headerfile <x86intrin.h>
184 ///
185 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
186 ///
187 /// \param __a
188 /// A 128-bit vector of [2 x double] containing the dividend.
189 /// \param __b
190 /// A 128-bit vector of [2 x double] containing divisor.
191 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
192 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
193 /// copied from the upper 64 bits of the first source operand.
194 static __inline__ __m128d __DEFAULT_FN_ATTRS
195 _mm_div_sd(__m128d __a, __m128d __b)
196 {
197  __a[0] /= __b[0];
198  return __a;
199 }
200 
201 /// Performs an element-by-element division of two 128-bit vectors of
202 /// [2 x double].
203 ///
204 /// \headerfile <x86intrin.h>
205 ///
206 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
207 ///
208 /// \param __a
209 /// A 128-bit vector of [2 x double] containing the dividend.
210 /// \param __b
211 /// A 128-bit vector of [2 x double] containing the divisor.
212 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
213 /// operands.
214 static __inline__ __m128d __DEFAULT_FN_ATTRS
215 _mm_div_pd(__m128d __a, __m128d __b)
216 {
217  return (__m128d)((__v2df)__a / (__v2df)__b);
218 }
219 
220 /// Calculates the square root of the lower double-precision value of
221 /// the second operand and returns it in the lower 64 bits of the result.
222 /// The upper 64 bits of the result are copied from the upper
223 /// double-precision value of the first operand.
224 ///
225 /// \headerfile <x86intrin.h>
226 ///
227 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
228 ///
229 /// \param __a
230 /// A 128-bit vector of [2 x double] containing one of the operands. The
231 /// upper 64 bits of this operand are copied to the upper 64 bits of the
232 /// result.
233 /// \param __b
234 /// A 128-bit vector of [2 x double] containing one of the operands. The
235 /// square root is calculated using the lower 64 bits of this operand.
236 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
237 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
238 /// bits are copied from the upper 64 bits of operand \a __a.
239 static __inline__ __m128d __DEFAULT_FN_ATTRS
240 _mm_sqrt_sd(__m128d __a, __m128d __b)
241 {
242  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
243  return __extension__ (__m128d) { __c[0], __a[1] };
244 }
245 
246 /// Calculates the square root of the each of two values stored in a
247 /// 128-bit vector of [2 x double].
248 ///
249 /// \headerfile <x86intrin.h>
250 ///
251 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
252 ///
253 /// \param __a
254 /// A 128-bit vector of [2 x double].
255 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
256 /// values in the operand.
257 static __inline__ __m128d __DEFAULT_FN_ATTRS
258 _mm_sqrt_pd(__m128d __a)
259 {
260  return __builtin_ia32_sqrtpd((__v2df)__a);
261 }
262 
263 /// Compares lower 64-bit double-precision values of both operands, and
264 /// returns the lesser of the pair of values in the lower 64-bits of the
265 /// result. The upper 64 bits of the result are copied from the upper
266 /// double-precision value of the first operand.
267 ///
268 /// \headerfile <x86intrin.h>
269 ///
270 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
271 ///
272 /// \param __a
273 /// A 128-bit vector of [2 x double] containing one of the operands. The
274 /// lower 64 bits of this operand are used in the comparison.
275 /// \param __b
276 /// A 128-bit vector of [2 x double] containing one of the operands. The
277 /// lower 64 bits of this operand are used in the comparison.
278 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
279 /// minimum value between both operands. The upper 64 bits are copied from
280 /// the upper 64 bits of the first source operand.
281 static __inline__ __m128d __DEFAULT_FN_ATTRS
282 _mm_min_sd(__m128d __a, __m128d __b)
283 {
284  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
285 }
286 
287 /// Performs element-by-element comparison of the two 128-bit vectors of
288 /// [2 x double] and returns the vector containing the lesser of each pair of
289 /// values.
290 ///
291 /// \headerfile <x86intrin.h>
292 ///
293 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
294 ///
295 /// \param __a
296 /// A 128-bit vector of [2 x double] containing one of the operands.
297 /// \param __b
298 /// A 128-bit vector of [2 x double] containing one of the operands.
299 /// \returns A 128-bit vector of [2 x double] containing the minimum values
300 /// between both operands.
301 static __inline__ __m128d __DEFAULT_FN_ATTRS
302 _mm_min_pd(__m128d __a, __m128d __b)
303 {
304  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
305 }
306 
307 /// Compares lower 64-bit double-precision values of both operands, and
308 /// returns the greater of the pair of values in the lower 64-bits of the
309 /// result. The upper 64 bits of the result are copied from the upper
310 /// double-precision value of the first operand.
311 ///
312 /// \headerfile <x86intrin.h>
313 ///
314 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
315 ///
316 /// \param __a
317 /// A 128-bit vector of [2 x double] containing one of the operands. The
318 /// lower 64 bits of this operand are used in the comparison.
319 /// \param __b
320 /// A 128-bit vector of [2 x double] containing one of the operands. The
321 /// lower 64 bits of this operand are used in the comparison.
322 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
323 /// maximum value between both operands. The upper 64 bits are copied from
324 /// the upper 64 bits of the first source operand.
325 static __inline__ __m128d __DEFAULT_FN_ATTRS
326 _mm_max_sd(__m128d __a, __m128d __b)
327 {
328  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
329 }
330 
331 /// Performs element-by-element comparison of the two 128-bit vectors of
332 /// [2 x double] and returns the vector containing the greater of each pair
333 /// of values.
334 ///
335 /// \headerfile <x86intrin.h>
336 ///
337 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
338 ///
339 /// \param __a
340 /// A 128-bit vector of [2 x double] containing one of the operands.
341 /// \param __b
342 /// A 128-bit vector of [2 x double] containing one of the operands.
343 /// \returns A 128-bit vector of [2 x double] containing the maximum values
344 /// between both operands.
345 static __inline__ __m128d __DEFAULT_FN_ATTRS
346 _mm_max_pd(__m128d __a, __m128d __b)
347 {
348  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
349 }
350 
351 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
352 ///
353 /// \headerfile <x86intrin.h>
354 ///
355 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
356 ///
357 /// \param __a
358 /// A 128-bit vector of [2 x double] containing one of the source operands.
359 /// \param __b
360 /// A 128-bit vector of [2 x double] containing one of the source operands.
361 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
362 /// values between both operands.
363 static __inline__ __m128d __DEFAULT_FN_ATTRS
364 _mm_and_pd(__m128d __a, __m128d __b)
365 {
366  return (__m128d)((__v2du)__a & (__v2du)__b);
367 }
368 
369 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
370 /// the one's complement of the values contained in the first source operand.
371 ///
372 /// \headerfile <x86intrin.h>
373 ///
374 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
375 ///
376 /// \param __a
377 /// A 128-bit vector of [2 x double] containing the left source operand. The
378 /// one's complement of this value is used in the bitwise AND.
379 /// \param __b
380 /// A 128-bit vector of [2 x double] containing the right source operand.
381 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
382 /// values in the second operand and the one's complement of the first
383 /// operand.
384 static __inline__ __m128d __DEFAULT_FN_ATTRS
385 _mm_andnot_pd(__m128d __a, __m128d __b)
386 {
387  return (__m128d)(~(__v2du)__a & (__v2du)__b);
388 }
389 
390 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
391 ///
392 /// \headerfile <x86intrin.h>
393 ///
394 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
395 ///
396 /// \param __a
397 /// A 128-bit vector of [2 x double] containing one of the source operands.
398 /// \param __b
399 /// A 128-bit vector of [2 x double] containing one of the source operands.
400 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
401 /// values between both operands.
402 static __inline__ __m128d __DEFAULT_FN_ATTRS
403 _mm_or_pd(__m128d __a, __m128d __b)
404 {
405  return (__m128d)((__v2du)__a | (__v2du)__b);
406 }
407 
408 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
409 ///
410 /// \headerfile <x86intrin.h>
411 ///
412 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
413 ///
414 /// \param __a
415 /// A 128-bit vector of [2 x double] containing one of the source operands.
416 /// \param __b
417 /// A 128-bit vector of [2 x double] containing one of the source operands.
418 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
419 /// values between both operands.
420 static __inline__ __m128d __DEFAULT_FN_ATTRS
421 _mm_xor_pd(__m128d __a, __m128d __b)
422 {
423  return (__m128d)((__v2du)__a ^ (__v2du)__b);
424 }
425 
426 /// Compares each of the corresponding double-precision values of the
427 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
428 /// for false, 0xFFFFFFFFFFFFFFFF for true.
429 ///
430 /// \headerfile <x86intrin.h>
431 ///
432 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
433 ///
434 /// \param __a
435 /// A 128-bit vector of [2 x double].
436 /// \param __b
437 /// A 128-bit vector of [2 x double].
438 /// \returns A 128-bit vector containing the comparison results.
439 static __inline__ __m128d __DEFAULT_FN_ATTRS
440 _mm_cmpeq_pd(__m128d __a, __m128d __b)
441 {
442  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
443 }
444 
445 /// Compares each of the corresponding double-precision values of the
446 /// 128-bit vectors of [2 x double] to determine if the values in the first
447 /// operand are less than those in the second operand. Each comparison
448 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
449 ///
450 /// \headerfile <x86intrin.h>
451 ///
452 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
453 ///
454 /// \param __a
455 /// A 128-bit vector of [2 x double].
456 /// \param __b
457 /// A 128-bit vector of [2 x double].
458 /// \returns A 128-bit vector containing the comparison results.
459 static __inline__ __m128d __DEFAULT_FN_ATTRS
460 _mm_cmplt_pd(__m128d __a, __m128d __b)
461 {
462  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
463 }
464 
465 /// Compares each of the corresponding double-precision values of the
466 /// 128-bit vectors of [2 x double] to determine if the values in the first
467 /// operand are less than or equal to those in the second operand.
468 ///
469 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
470 ///
471 /// \headerfile <x86intrin.h>
472 ///
473 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
474 ///
475 /// \param __a
476 /// A 128-bit vector of [2 x double].
477 /// \param __b
478 /// A 128-bit vector of [2 x double].
479 /// \returns A 128-bit vector containing the comparison results.
480 static __inline__ __m128d __DEFAULT_FN_ATTRS
481 _mm_cmple_pd(__m128d __a, __m128d __b)
482 {
483  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
484 }
485 
486 /// Compares each of the corresponding double-precision values of the
487 /// 128-bit vectors of [2 x double] to determine if the values in the first
488 /// operand are greater than those in the second operand.
489 ///
490 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
491 ///
492 /// \headerfile <x86intrin.h>
493 ///
494 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
495 ///
496 /// \param __a
497 /// A 128-bit vector of [2 x double].
498 /// \param __b
499 /// A 128-bit vector of [2 x double].
500 /// \returns A 128-bit vector containing the comparison results.
501 static __inline__ __m128d __DEFAULT_FN_ATTRS
502 _mm_cmpgt_pd(__m128d __a, __m128d __b)
503 {
504  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
505 }
506 
507 /// Compares each of the corresponding double-precision values of the
508 /// 128-bit vectors of [2 x double] to determine if the values in the first
509 /// operand are greater than or equal to those in the second operand.
510 ///
511 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
512 ///
513 /// \headerfile <x86intrin.h>
514 ///
515 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
516 ///
517 /// \param __a
518 /// A 128-bit vector of [2 x double].
519 /// \param __b
520 /// A 128-bit vector of [2 x double].
521 /// \returns A 128-bit vector containing the comparison results.
522 static __inline__ __m128d __DEFAULT_FN_ATTRS
523 _mm_cmpge_pd(__m128d __a, __m128d __b)
524 {
525  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
526 }
527 
528 /// Compares each of the corresponding double-precision values of the
529 /// 128-bit vectors of [2 x double] to determine if the values in the first
530 /// operand are ordered with respect to those in the second operand.
531 ///
532 /// A pair of double-precision values are "ordered" with respect to each
533 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
534 /// 0xFFFFFFFFFFFFFFFF for true.
535 ///
536 /// \headerfile <x86intrin.h>
537 ///
538 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
539 ///
540 /// \param __a
541 /// A 128-bit vector of [2 x double].
542 /// \param __b
543 /// A 128-bit vector of [2 x double].
544 /// \returns A 128-bit vector containing the comparison results.
545 static __inline__ __m128d __DEFAULT_FN_ATTRS
546 _mm_cmpord_pd(__m128d __a, __m128d __b)
547 {
548  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
549 }
550 
551 /// Compares each of the corresponding double-precision values of the
552 /// 128-bit vectors of [2 x double] to determine if the values in the first
553 /// operand are unordered with respect to those in the second operand.
554 ///
555 /// A pair of double-precision values are "unordered" with respect to each
556 /// other if one or both values are NaN. Each comparison yields 0x0 for
557 /// false, 0xFFFFFFFFFFFFFFFF for true.
558 ///
559 /// \headerfile <x86intrin.h>
560 ///
561 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
562 /// instruction.
563 ///
564 /// \param __a
565 /// A 128-bit vector of [2 x double].
566 /// \param __b
567 /// A 128-bit vector of [2 x double].
568 /// \returns A 128-bit vector containing the comparison results.
569 static __inline__ __m128d __DEFAULT_FN_ATTRS
570 _mm_cmpunord_pd(__m128d __a, __m128d __b)
571 {
572  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
573 }
574 
575 /// Compares each of the corresponding double-precision values of the
576 /// 128-bit vectors of [2 x double] to determine if the values in the first
577 /// operand are unequal to those in the second operand.
578 ///
579 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
580 ///
581 /// \headerfile <x86intrin.h>
582 ///
583 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
584 ///
585 /// \param __a
586 /// A 128-bit vector of [2 x double].
587 /// \param __b
588 /// A 128-bit vector of [2 x double].
589 /// \returns A 128-bit vector containing the comparison results.
590 static __inline__ __m128d __DEFAULT_FN_ATTRS
591 _mm_cmpneq_pd(__m128d __a, __m128d __b)
592 {
593  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
594 }
595 
596 /// Compares each of the corresponding double-precision values of the
597 /// 128-bit vectors of [2 x double] to determine if the values in the first
598 /// operand are not less than those in the second operand.
599 ///
600 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
601 ///
602 /// \headerfile <x86intrin.h>
603 ///
604 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
605 ///
606 /// \param __a
607 /// A 128-bit vector of [2 x double].
608 /// \param __b
609 /// A 128-bit vector of [2 x double].
610 /// \returns A 128-bit vector containing the comparison results.
611 static __inline__ __m128d __DEFAULT_FN_ATTRS
612 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
613 {
614  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
615 }
616 
617 /// Compares each of the corresponding double-precision values of the
618 /// 128-bit vectors of [2 x double] to determine if the values in the first
619 /// operand are not less than or equal to those in the second operand.
620 ///
621 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
622 ///
623 /// \headerfile <x86intrin.h>
624 ///
625 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
626 ///
627 /// \param __a
628 /// A 128-bit vector of [2 x double].
629 /// \param __b
630 /// A 128-bit vector of [2 x double].
631 /// \returns A 128-bit vector containing the comparison results.
632 static __inline__ __m128d __DEFAULT_FN_ATTRS
633 _mm_cmpnle_pd(__m128d __a, __m128d __b)
634 {
635  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
636 }
637 
638 /// Compares each of the corresponding double-precision values of the
639 /// 128-bit vectors of [2 x double] to determine if the values in the first
640 /// operand are not greater than those in the second operand.
641 ///
642 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
643 ///
644 /// \headerfile <x86intrin.h>
645 ///
646 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
647 ///
648 /// \param __a
649 /// A 128-bit vector of [2 x double].
650 /// \param __b
651 /// A 128-bit vector of [2 x double].
652 /// \returns A 128-bit vector containing the comparison results.
653 static __inline__ __m128d __DEFAULT_FN_ATTRS
654 _mm_cmpngt_pd(__m128d __a, __m128d __b)
655 {
656  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
657 }
658 
659 /// Compares each of the corresponding double-precision values of the
660 /// 128-bit vectors of [2 x double] to determine if the values in the first
661 /// operand are not greater than or equal to those in the second operand.
662 ///
663 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
664 ///
665 /// \headerfile <x86intrin.h>
666 ///
667 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
668 ///
669 /// \param __a
670 /// A 128-bit vector of [2 x double].
671 /// \param __b
672 /// A 128-bit vector of [2 x double].
673 /// \returns A 128-bit vector containing the comparison results.
674 static __inline__ __m128d __DEFAULT_FN_ATTRS
675 _mm_cmpnge_pd(__m128d __a, __m128d __b)
676 {
677  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
678 }
679 
680 /// Compares the lower double-precision floating-point values in each of
681 /// the two 128-bit floating-point vectors of [2 x double] for equality.
682 ///
683 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
684 ///
685 /// \headerfile <x86intrin.h>
686 ///
687 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
688 ///
689 /// \param __a
690 /// A 128-bit vector of [2 x double]. The lower double-precision value is
691 /// compared to the lower double-precision value of \a __b.
692 /// \param __b
693 /// A 128-bit vector of [2 x double]. The lower double-precision value is
694 /// compared to the lower double-precision value of \a __a.
695 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
696 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
697 static __inline__ __m128d __DEFAULT_FN_ATTRS
698 _mm_cmpeq_sd(__m128d __a, __m128d __b)
699 {
700  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
701 }
702 
703 /// Compares the lower double-precision floating-point values in each of
704 /// the two 128-bit floating-point vectors of [2 x double] to determine if
705 /// the value in the first parameter is less than the corresponding value in
706 /// the second parameter.
707 ///
708 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
709 ///
710 /// \headerfile <x86intrin.h>
711 ///
712 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
713 ///
714 /// \param __a
715 /// A 128-bit vector of [2 x double]. The lower double-precision value is
716 /// compared to the lower double-precision value of \a __b.
717 /// \param __b
718 /// A 128-bit vector of [2 x double]. The lower double-precision value is
719 /// compared to the lower double-precision value of \a __a.
720 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
721 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
722 static __inline__ __m128d __DEFAULT_FN_ATTRS
723 _mm_cmplt_sd(__m128d __a, __m128d __b)
724 {
725  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
726 }
727 
728 /// Compares the lower double-precision floating-point values in each of
729 /// the two 128-bit floating-point vectors of [2 x double] to determine if
730 /// the value in the first parameter is less than or equal to the
731 /// corresponding value in the second parameter.
732 ///
733 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
734 ///
735 /// \headerfile <x86intrin.h>
736 ///
737 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
738 ///
739 /// \param __a
740 /// A 128-bit vector of [2 x double]. The lower double-precision value is
741 /// compared to the lower double-precision value of \a __b.
742 /// \param __b
743 /// A 128-bit vector of [2 x double]. The lower double-precision value is
744 /// compared to the lower double-precision value of \a __a.
745 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
746 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
747 static __inline__ __m128d __DEFAULT_FN_ATTRS
748 _mm_cmple_sd(__m128d __a, __m128d __b)
749 {
750  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
751 }
752 
753 /// Compares the lower double-precision floating-point values in each of
754 /// the two 128-bit floating-point vectors of [2 x double] to determine if
755 /// the value in the first parameter is greater than the corresponding value
756 /// in the second parameter.
757 ///
758 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
759 ///
760 /// \headerfile <x86intrin.h>
761 ///
762 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
763 ///
764 /// \param __a
765 /// A 128-bit vector of [2 x double]. The lower double-precision value is
766 /// compared to the lower double-precision value of \a __b.
767 /// \param __b
768 /// A 128-bit vector of [2 x double]. The lower double-precision value is
769 /// compared to the lower double-precision value of \a __a.
770 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
771 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
772 static __inline__ __m128d __DEFAULT_FN_ATTRS
773 _mm_cmpgt_sd(__m128d __a, __m128d __b)
774 {
775  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
776  return __extension__ (__m128d) { __c[0], __a[1] };
777 }
778 
779 /// Compares the lower double-precision floating-point values in each of
780 /// the two 128-bit floating-point vectors of [2 x double] to determine if
781 /// the value in the first parameter is greater than or equal to the
782 /// corresponding value in the second parameter.
783 ///
784 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
785 ///
786 /// \headerfile <x86intrin.h>
787 ///
788 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
789 ///
790 /// \param __a
791 /// A 128-bit vector of [2 x double]. The lower double-precision value is
792 /// compared to the lower double-precision value of \a __b.
793 /// \param __b
794 /// A 128-bit vector of [2 x double]. The lower double-precision value is
795 /// compared to the lower double-precision value of \a __a.
796 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
797 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
798 static __inline__ __m128d __DEFAULT_FN_ATTRS
799 _mm_cmpge_sd(__m128d __a, __m128d __b)
800 {
801  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
802  return __extension__ (__m128d) { __c[0], __a[1] };
803 }
804 
805 /// Compares the lower double-precision floating-point values in each of
806 /// the two 128-bit floating-point vectors of [2 x double] to determine if
807 /// the value in the first parameter is "ordered" with respect to the
808 /// corresponding value in the second parameter.
809 ///
810 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
811 /// of double-precision values are "ordered" with respect to each other if
812 /// neither value is a NaN.
813 ///
814 /// \headerfile <x86intrin.h>
815 ///
816 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
817 ///
818 /// \param __a
819 /// A 128-bit vector of [2 x double]. The lower double-precision value is
820 /// compared to the lower double-precision value of \a __b.
821 /// \param __b
822 /// A 128-bit vector of [2 x double]. The lower double-precision value is
823 /// compared to the lower double-precision value of \a __a.
824 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
825 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
826 static __inline__ __m128d __DEFAULT_FN_ATTRS
827 _mm_cmpord_sd(__m128d __a, __m128d __b)
828 {
829  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
830 }
831 
832 /// Compares the lower double-precision floating-point values in each of
833 /// the two 128-bit floating-point vectors of [2 x double] to determine if
834 /// the value in the first parameter is "unordered" with respect to the
835 /// corresponding value in the second parameter.
836 ///
837 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
838 /// of double-precision values are "unordered" with respect to each other if
839 /// one or both values are NaN.
840 ///
841 /// \headerfile <x86intrin.h>
842 ///
843 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
844 /// instruction.
845 ///
846 /// \param __a
847 /// A 128-bit vector of [2 x double]. The lower double-precision value is
848 /// compared to the lower double-precision value of \a __b.
849 /// \param __b
850 /// A 128-bit vector of [2 x double]. The lower double-precision value is
851 /// compared to the lower double-precision value of \a __a.
852 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
853 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
854 static __inline__ __m128d __DEFAULT_FN_ATTRS
855 _mm_cmpunord_sd(__m128d __a, __m128d __b)
856 {
857  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
858 }
859 
860 /// Compares the lower double-precision floating-point values in each of
861 /// the two 128-bit floating-point vectors of [2 x double] to determine if
862 /// the value in the first parameter is unequal to the corresponding value in
863 /// the second parameter.
864 ///
865 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
866 ///
867 /// \headerfile <x86intrin.h>
868 ///
869 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
870 ///
871 /// \param __a
872 /// A 128-bit vector of [2 x double]. The lower double-precision value is
873 /// compared to the lower double-precision value of \a __b.
874 /// \param __b
875 /// A 128-bit vector of [2 x double]. The lower double-precision value is
876 /// compared to the lower double-precision value of \a __a.
877 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
878 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
879 static __inline__ __m128d __DEFAULT_FN_ATTRS
880 _mm_cmpneq_sd(__m128d __a, __m128d __b)
881 {
882  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
883 }
884 
885 /// Compares the lower double-precision floating-point values in each of
886 /// the two 128-bit floating-point vectors of [2 x double] to determine if
887 /// the value in the first parameter is not less than the corresponding
888 /// value in the second parameter.
889 ///
890 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
891 ///
892 /// \headerfile <x86intrin.h>
893 ///
894 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
895 ///
896 /// \param __a
897 /// A 128-bit vector of [2 x double]. The lower double-precision value is
898 /// compared to the lower double-precision value of \a __b.
899 /// \param __b
900 /// A 128-bit vector of [2 x double]. The lower double-precision value is
901 /// compared to the lower double-precision value of \a __a.
902 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
903 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
904 static __inline__ __m128d __DEFAULT_FN_ATTRS
905 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
906 {
907  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
908 }
909 
910 /// Compares the lower double-precision floating-point values in each of
911 /// the two 128-bit floating-point vectors of [2 x double] to determine if
912 /// the value in the first parameter is not less than or equal to the
913 /// corresponding value in the second parameter.
914 ///
915 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
916 ///
917 /// \headerfile <x86intrin.h>
918 ///
919 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
920 ///
921 /// \param __a
922 /// A 128-bit vector of [2 x double]. The lower double-precision value is
923 /// compared to the lower double-precision value of \a __b.
924 /// \param __b
925 /// A 128-bit vector of [2 x double]. The lower double-precision value is
926 /// compared to the lower double-precision value of \a __a.
927 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
928 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
929 static __inline__ __m128d __DEFAULT_FN_ATTRS
930 _mm_cmpnle_sd(__m128d __a, __m128d __b)
931 {
932  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
933 }
934 
935 /// Compares the lower double-precision floating-point values in each of
936 /// the two 128-bit floating-point vectors of [2 x double] to determine if
937 /// the value in the first parameter is not greater than the corresponding
938 /// value in the second parameter.
939 ///
940 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
941 ///
942 /// \headerfile <x86intrin.h>
943 ///
944 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
945 ///
946 /// \param __a
947 /// A 128-bit vector of [2 x double]. The lower double-precision value is
948 /// compared to the lower double-precision value of \a __b.
949 /// \param __b
950 /// A 128-bit vector of [2 x double]. The lower double-precision value is
951 /// compared to the lower double-precision value of \a __a.
952 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
953 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
954 static __inline__ __m128d __DEFAULT_FN_ATTRS
955 _mm_cmpngt_sd(__m128d __a, __m128d __b)
956 {
957  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
958  return __extension__ (__m128d) { __c[0], __a[1] };
959 }
960 
961 /// Compares the lower double-precision floating-point values in each of
962 /// the two 128-bit floating-point vectors of [2 x double] to determine if
963 /// the value in the first parameter is not greater than or equal to the
964 /// corresponding value in the second parameter.
965 ///
966 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
967 ///
968 /// \headerfile <x86intrin.h>
969 ///
970 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
971 ///
972 /// \param __a
973 /// A 128-bit vector of [2 x double]. The lower double-precision value is
974 /// compared to the lower double-precision value of \a __b.
975 /// \param __b
976 /// A 128-bit vector of [2 x double]. The lower double-precision value is
977 /// compared to the lower double-precision value of \a __a.
978 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
979 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
980 static __inline__ __m128d __DEFAULT_FN_ATTRS
981 _mm_cmpnge_sd(__m128d __a, __m128d __b)
982 {
983  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
984  return __extension__ (__m128d) { __c[0], __a[1] };
985 }
986 
987 /// Compares the lower double-precision floating-point values in each of
988 /// the two 128-bit floating-point vectors of [2 x double] for equality.
989 ///
990 /// The comparison yields 0 for false, 1 for true. If either of the two
991 /// lower double-precision values is NaN, 0 is returned.
992 ///
993 /// \headerfile <x86intrin.h>
994 ///
995 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
996 ///
997 /// \param __a
998 /// A 128-bit vector of [2 x double]. The lower double-precision value is
999 /// compared to the lower double-precision value of \a __b.
1000 /// \param __b
1001 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1002 /// compared to the lower double-precision value of \a __a.
1003 /// \returns An integer containing the comparison results. If either of the two
1004 /// lower double-precision values is NaN, 0 is returned.
1005 static __inline__ int __DEFAULT_FN_ATTRS
1006 _mm_comieq_sd(__m128d __a, __m128d __b)
1007 {
1008  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1009 }
1010 
1011 /// Compares the lower double-precision floating-point values in each of
1012 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1013 /// the value in the first parameter is less than the corresponding value in
1014 /// the second parameter.
1015 ///
1016 /// The comparison yields 0 for false, 1 for true. If either of the two
1017 /// lower double-precision values is NaN, 0 is returned.
1018 ///
1019 /// \headerfile <x86intrin.h>
1020 ///
1021 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1022 ///
1023 /// \param __a
1024 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1025 /// compared to the lower double-precision value of \a __b.
1026 /// \param __b
1027 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1028 /// compared to the lower double-precision value of \a __a.
1029 /// \returns An integer containing the comparison results. If either of the two
1030 /// lower double-precision values is NaN, 0 is returned.
1031 static __inline__ int __DEFAULT_FN_ATTRS
1032 _mm_comilt_sd(__m128d __a, __m128d __b)
1033 {
1034  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1035 }
1036 
1037 /// Compares the lower double-precision floating-point values in each of
1038 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1039 /// the value in the first parameter is less than or equal to the
1040 /// corresponding value in the second parameter.
1041 ///
1042 /// The comparison yields 0 for false, 1 for true. If either of the two
1043 /// lower double-precision values is NaN, 0 is returned.
1044 ///
1045 /// \headerfile <x86intrin.h>
1046 ///
1047 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1048 ///
1049 /// \param __a
1050 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1051 /// compared to the lower double-precision value of \a __b.
1052 /// \param __b
1053 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1054 /// compared to the lower double-precision value of \a __a.
1055 /// \returns An integer containing the comparison results. If either of the two
1056 /// lower double-precision values is NaN, 0 is returned.
1057 static __inline__ int __DEFAULT_FN_ATTRS
1058 _mm_comile_sd(__m128d __a, __m128d __b)
1059 {
1060  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1061 }
1062 
1063 /// Compares the lower double-precision floating-point values in each of
1064 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1065 /// the value in the first parameter is greater than the corresponding value
1066 /// in the second parameter.
1067 ///
1068 /// The comparison yields 0 for false, 1 for true. If either of the two
1069 /// lower double-precision values is NaN, 0 is returned.
1070 ///
1071 /// \headerfile <x86intrin.h>
1072 ///
1073 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1074 ///
1075 /// \param __a
1076 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1077 /// compared to the lower double-precision value of \a __b.
1078 /// \param __b
1079 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1080 /// compared to the lower double-precision value of \a __a.
1081 /// \returns An integer containing the comparison results. If either of the two
1082 /// lower double-precision values is NaN, 0 is returned.
1083 static __inline__ int __DEFAULT_FN_ATTRS
1084 _mm_comigt_sd(__m128d __a, __m128d __b)
1085 {
1086  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1087 }
1088 
1089 /// Compares the lower double-precision floating-point values in each of
1090 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1091 /// the value in the first parameter is greater than or equal to the
1092 /// corresponding value in the second parameter.
1093 ///
1094 /// The comparison yields 0 for false, 1 for true. If either of the two
1095 /// lower double-precision values is NaN, 0 is returned.
1096 ///
1097 /// \headerfile <x86intrin.h>
1098 ///
1099 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1100 ///
1101 /// \param __a
1102 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1103 /// compared to the lower double-precision value of \a __b.
1104 /// \param __b
1105 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1106 /// compared to the lower double-precision value of \a __a.
1107 /// \returns An integer containing the comparison results. If either of the two
1108 /// lower double-precision values is NaN, 0 is returned.
1109 static __inline__ int __DEFAULT_FN_ATTRS
1110 _mm_comige_sd(__m128d __a, __m128d __b)
1111 {
1112  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1113 }
1114 
1115 /// Compares the lower double-precision floating-point values in each of
1116 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1117 /// the value in the first parameter is unequal to the corresponding value in
1118 /// the second parameter.
1119 ///
1120 /// The comparison yields 0 for false, 1 for true. If either of the two
1121 /// lower double-precision values is NaN, 1 is returned.
1122 ///
1123 /// \headerfile <x86intrin.h>
1124 ///
1125 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1126 ///
1127 /// \param __a
1128 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1129 /// compared to the lower double-precision value of \a __b.
1130 /// \param __b
1131 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1132 /// compared to the lower double-precision value of \a __a.
1133 /// \returns An integer containing the comparison results. If either of the two
1134 /// lower double-precision values is NaN, 1 is returned.
1135 static __inline__ int __DEFAULT_FN_ATTRS
1136 _mm_comineq_sd(__m128d __a, __m128d __b)
1137 {
1138  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1139 }
1140 
1141 /// Compares the lower double-precision floating-point values in each of
1142 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1143 /// comparison yields 0 for false, 1 for true.
1144 ///
1145 /// If either of the two lower double-precision values is NaN, 0 is returned.
1146 ///
1147 /// \headerfile <x86intrin.h>
1148 ///
1149 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1150 ///
1151 /// \param __a
1152 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1153 /// compared to the lower double-precision value of \a __b.
1154 /// \param __b
1155 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1156 /// compared to the lower double-precision value of \a __a.
1157 /// \returns An integer containing the comparison results. If either of the two
1158 /// lower double-precision values is NaN, 0 is returned.
1159 static __inline__ int __DEFAULT_FN_ATTRS
1160 _mm_ucomieq_sd(__m128d __a, __m128d __b)
1161 {
1162  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1163 }
1164 
1165 /// Compares the lower double-precision floating-point values in each of
1166 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1167 /// the value in the first parameter is less than the corresponding value in
1168 /// the second parameter.
1169 ///
1170 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1171 /// double-precision values is NaN, 0 is returned.
1172 ///
1173 /// \headerfile <x86intrin.h>
1174 ///
1175 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1176 ///
1177 /// \param __a
1178 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1179 /// compared to the lower double-precision value of \a __b.
1180 /// \param __b
1181 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1182 /// compared to the lower double-precision value of \a __a.
1183 /// \returns An integer containing the comparison results. If either of the two
1184 /// lower double-precision values is NaN, 0 is returned.
1185 static __inline__ int __DEFAULT_FN_ATTRS
1186 _mm_ucomilt_sd(__m128d __a, __m128d __b)
1187 {
1188  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1189 }
1190 
1191 /// Compares the lower double-precision floating-point values in each of
1192 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1193 /// the value in the first parameter is less than or equal to the
1194 /// corresponding value in the second parameter.
1195 ///
1196 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1197 /// double-precision values is NaN, 0 is returned.
1198 ///
1199 /// \headerfile <x86intrin.h>
1200 ///
1201 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1202 ///
1203 /// \param __a
1204 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1205 /// compared to the lower double-precision value of \a __b.
1206 /// \param __b
1207 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1208 /// compared to the lower double-precision value of \a __a.
1209 /// \returns An integer containing the comparison results. If either of the two
1210 /// lower double-precision values is NaN, 0 is returned.
1211 static __inline__ int __DEFAULT_FN_ATTRS
1212 _mm_ucomile_sd(__m128d __a, __m128d __b)
1213 {
1214  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1215 }
1216 
1217 /// Compares the lower double-precision floating-point values in each of
1218 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1219 /// the value in the first parameter is greater than the corresponding value
1220 /// in the second parameter.
1221 ///
1222 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1223 /// double-precision values is NaN, 0 is returned.
1224 ///
1225 /// \headerfile <x86intrin.h>
1226 ///
1227 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1228 ///
1229 /// \param __a
1230 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1231 /// compared to the lower double-precision value of \a __b.
1232 /// \param __b
1233 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1234 /// compared to the lower double-precision value of \a __a.
1235 /// \returns An integer containing the comparison results. If either of the two
1236 /// lower double-precision values is NaN, 0 is returned.
1237 static __inline__ int __DEFAULT_FN_ATTRS
1238 _mm_ucomigt_sd(__m128d __a, __m128d __b)
1239 {
1240  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1241 }
1242 
1243 /// Compares the lower double-precision floating-point values in each of
1244 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1245 /// the value in the first parameter is greater than or equal to the
1246 /// corresponding value in the second parameter.
1247 ///
1248 /// The comparison yields 0 for false, 1 for true. If either of the two
1249 /// lower double-precision values is NaN, 0 is returned.
1250 ///
1251 /// \headerfile <x86intrin.h>
1252 ///
1253 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1254 ///
1255 /// \param __a
1256 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1257 /// compared to the lower double-precision value of \a __b.
1258 /// \param __b
1259 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1260 /// compared to the lower double-precision value of \a __a.
1261 /// \returns An integer containing the comparison results. If either of the two
1262 /// lower double-precision values is NaN, 0 is returned.
1263 static __inline__ int __DEFAULT_FN_ATTRS
1264 _mm_ucomige_sd(__m128d __a, __m128d __b)
1265 {
1266  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1267 }
1268 
1269 /// Compares the lower double-precision floating-point values in each of
1270 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1271 /// the value in the first parameter is unequal to the corresponding value in
1272 /// the second parameter.
1273 ///
1274 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1275 /// double-precision values is NaN, 1 is returned.
1276 ///
1277 /// \headerfile <x86intrin.h>
1278 ///
1279 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1280 ///
1281 /// \param __a
1282 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1283 /// compared to the lower double-precision value of \a __b.
1284 /// \param __b
1285 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1286 /// compared to the lower double-precision value of \a __a.
1287 /// \returns An integer containing the comparison result. If either of the two
1288 /// lower double-precision values is NaN, 1 is returned.
1289 static __inline__ int __DEFAULT_FN_ATTRS
1290 _mm_ucomineq_sd(__m128d __a, __m128d __b)
1291 {
1292  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1293 }
1294 
1295 /// Converts the two double-precision floating-point elements of a
1296 /// 128-bit vector of [2 x double] into two single-precision floating-point
1297 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1298 /// The upper 64 bits of the result vector are set to zero.
1299 ///
1300 /// \headerfile <x86intrin.h>
1301 ///
1302 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1303 ///
1304 /// \param __a
1305 /// A 128-bit vector of [2 x double].
1306 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1307 /// converted values. The upper 64 bits are set to zero.
1308 static __inline__ __m128 __DEFAULT_FN_ATTRS
1310 {
1311  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1312 }
1313 
1314 /// Converts the lower two single-precision floating-point elements of a
1315 /// 128-bit vector of [4 x float] into two double-precision floating-point
1316 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1317 /// elements of the input vector are unused.
1318 ///
1319 /// \headerfile <x86intrin.h>
1320 ///
1321 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1322 ///
1323 /// \param __a
1324 /// A 128-bit vector of [4 x float]. The lower two single-precision
1325 /// floating-point elements are converted to double-precision values. The
1326 /// upper two elements are unused.
1327 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1328 static __inline__ __m128d __DEFAULT_FN_ATTRS
1330 {
1331  return (__m128d) __builtin_convertvector(
1332  __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1333 }
1334 
1335 /// Converts the lower two integer elements of a 128-bit vector of
1336 /// [4 x i32] into two double-precision floating-point values, returned in a
1337 /// 128-bit vector of [2 x double].
1338 ///
1339 /// The upper two elements of the input vector are unused.
1340 ///
1341 /// \headerfile <x86intrin.h>
1342 ///
1343 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1344 ///
1345 /// \param __a
1346 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1347 /// converted to double-precision values.
1348 ///
1349 /// The upper two elements are unused.
1350 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1351 static __inline__ __m128d __DEFAULT_FN_ATTRS
1353 {
1354  return (__m128d) __builtin_convertvector(
1355  __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1356 }
1357 
1358 /// Converts the two double-precision floating-point elements of a
1359 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1360 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1361 /// 64 bits of the result vector are set to zero.
1362 ///
1363 /// \headerfile <x86intrin.h>
1364 ///
1365 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1366 ///
1367 /// \param __a
1368 /// A 128-bit vector of [2 x double].
1369 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1370 /// converted values. The upper 64 bits are set to zero.
1371 static __inline__ __m128i __DEFAULT_FN_ATTRS
1373 {
1374  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1375 }
1376 
1377 /// Converts the low-order element of a 128-bit vector of [2 x double]
1378 /// into a 32-bit signed integer value.
1379 ///
1380 /// \headerfile <x86intrin.h>
1381 ///
1382 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1383 ///
1384 /// \param __a
1385 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1386 /// conversion.
1387 /// \returns A 32-bit signed integer containing the converted value.
1388 static __inline__ int __DEFAULT_FN_ATTRS
1390 {
1391  return __builtin_ia32_cvtsd2si((__v2df)__a);
1392 }
1393 
1394 /// Converts the lower double-precision floating-point element of a
1395 /// 128-bit vector of [2 x double], in the second parameter, into a
1396 /// single-precision floating-point value, returned in the lower 32 bits of a
1397 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1398 /// copied from the upper 96 bits of the first parameter.
1399 ///
1400 /// \headerfile <x86intrin.h>
1401 ///
1402 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1403 ///
1404 /// \param __a
1405 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1406 /// copied to the upper 96 bits of the result.
1407 /// \param __b
1408 /// A 128-bit vector of [2 x double]. The lower double-precision
1409 /// floating-point element is used in the conversion.
1410 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1411 /// converted value from the second parameter. The upper 96 bits are copied
1412 /// from the upper 96 bits of the first parameter.
1413 static __inline__ __m128 __DEFAULT_FN_ATTRS
1414 _mm_cvtsd_ss(__m128 __a, __m128d __b)
1415 {
1416  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1417 }
1418 
1419 /// Converts a 32-bit signed integer value, in the second parameter, into
1420 /// a double-precision floating-point value, returned in the lower 64 bits of
1421 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1422 /// are copied from the upper 64 bits of the first parameter.
1423 ///
1424 /// \headerfile <x86intrin.h>
1425 ///
1426 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1427 ///
1428 /// \param __a
1429 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1430 /// copied to the upper 64 bits of the result.
1431 /// \param __b
1432 /// A 32-bit signed integer containing the value to be converted.
1433 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1434 /// converted value from the second parameter. The upper 64 bits are copied
1435 /// from the upper 64 bits of the first parameter.
1436 static __inline__ __m128d __DEFAULT_FN_ATTRS
1437 _mm_cvtsi32_sd(__m128d __a, int __b)
1438 {
1439  __a[0] = __b;
1440  return __a;
1441 }
1442 
1443 /// Converts the lower single-precision floating-point element of a
1444 /// 128-bit vector of [4 x float], in the second parameter, into a
1445 /// double-precision floating-point value, returned in the lower 64 bits of
1446 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1447 /// are copied from the upper 64 bits of the first parameter.
1448 ///
1449 /// \headerfile <x86intrin.h>
1450 ///
1451 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1452 ///
1453 /// \param __a
1454 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1455 /// copied to the upper 64 bits of the result.
1456 /// \param __b
1457 /// A 128-bit vector of [4 x float]. The lower single-precision
1458 /// floating-point element is used in the conversion.
1459 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1460 /// converted value from the second parameter. The upper 64 bits are copied
1461 /// from the upper 64 bits of the first parameter.
1462 static __inline__ __m128d __DEFAULT_FN_ATTRS
1463 _mm_cvtss_sd(__m128d __a, __m128 __b)
1464 {
1465  __a[0] = __b[0];
1466  return __a;
1467 }
1468 
1469 /// Converts the two double-precision floating-point elements of a
1470 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1471 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1472 ///
1473 /// If the result of either conversion is inexact, the result is truncated
1474 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1475 /// 64 bits of the result vector are set to zero.
1476 ///
1477 /// \headerfile <x86intrin.h>
1478 ///
1479 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1480 /// instruction.
1481 ///
1482 /// \param __a
1483 /// A 128-bit vector of [2 x double].
1484 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1485 /// converted values. The upper 64 bits are set to zero.
1486 static __inline__ __m128i __DEFAULT_FN_ATTRS
1488 {
1489  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1490 }
1491 
1492 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1493 /// signed integer value, truncating the result when it is inexact.
1494 ///
1495 /// \headerfile <x86intrin.h>
1496 ///
1497 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1498 /// instruction.
1499 ///
1500 /// \param __a
1501 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1502 /// conversion.
1503 /// \returns A 32-bit signed integer containing the converted value.
1504 static __inline__ int __DEFAULT_FN_ATTRS
1506 {
1507  return __builtin_ia32_cvttsd2si((__v2df)__a);
1508 }
1509 
1510 /// Converts the two double-precision floating-point elements of a
1511 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1512 /// returned in a 64-bit vector of [2 x i32].
1513 ///
1514 /// \headerfile <x86intrin.h>
1515 ///
1516 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1517 ///
1518 /// \param __a
1519 /// A 128-bit vector of [2 x double].
1520 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1521 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1523 {
1524  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1525 }
1526 
1527 /// Converts the two double-precision floating-point elements of a
1528 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1529 /// returned in a 64-bit vector of [2 x i32].
1530 ///
1531 /// If the result of either conversion is inexact, the result is truncated
1532 /// (rounded towards zero) regardless of the current MXCSR setting.
1533 ///
1534 /// \headerfile <x86intrin.h>
1535 ///
1536 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1537 ///
1538 /// \param __a
1539 /// A 128-bit vector of [2 x double].
1540 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1541 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1543 {
1544  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1545 }
1546 
1547 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1548 /// [2 x i32] into two double-precision floating-point values, returned in a
1549 /// 128-bit vector of [2 x double].
1550 ///
1551 /// \headerfile <x86intrin.h>
1552 ///
1553 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1554 ///
1555 /// \param __a
1556 /// A 64-bit vector of [2 x i32].
1557 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1558 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
1560 {
1561  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1562 }
1563 
1564 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1565 /// a double-precision floating-point value.
1566 ///
1567 /// \headerfile <x86intrin.h>
1568 ///
1569 /// This intrinsic has no corresponding instruction.
1570 ///
1571 /// \param __a
1572 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1573 /// \returns A double-precision floating-point value copied from the lower 64
1574 /// bits of \a __a.
1575 static __inline__ double __DEFAULT_FN_ATTRS
1577 {
1578  return __a[0];
1579 }
1580 
1581 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1582 /// memory location.
1583 ///
1584 /// \headerfile <x86intrin.h>
1585 ///
1586 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1587 ///
1588 /// \param __dp
1589 /// A pointer to a 128-bit memory location. The address of the memory
1590 /// location has to be 16-byte aligned.
1591 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1592 static __inline__ __m128d __DEFAULT_FN_ATTRS
1593 _mm_load_pd(double const *__dp)
1594 {
1595  return *(__m128d*)__dp;
1596 }
1597 
1598 /// Loads a double-precision floating-point value from a specified memory
1599 /// location and duplicates it to both vector elements of a 128-bit vector of
1600 /// [2 x double].
1601 ///
1602 /// \headerfile <x86intrin.h>
1603 ///
1604 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1605 ///
1606 /// \param __dp
1607 /// A pointer to a memory location containing a double-precision value.
1608 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1609 /// duplicated values.
1610 static __inline__ __m128d __DEFAULT_FN_ATTRS
1611 _mm_load1_pd(double const *__dp)
1612 {
1613  struct __mm_load1_pd_struct {
1614  double __u;
1615  } __attribute__((__packed__, __may_alias__));
1616  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
1617  return __extension__ (__m128d){ __u, __u };
1618 }
1619 
// Alternate spelling that forwards to _mm_load1_pd.
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1621 
1622 /// Loads two double-precision values, in reverse order, from an aligned
1623 /// memory location into a 128-bit vector of [2 x double].
1624 ///
1625 /// \headerfile <x86intrin.h>
1626 ///
1627 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1628 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1629 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1630 ///
1631 /// \param __dp
1632 /// A 16-byte aligned pointer to an array of double-precision values to be
1633 /// loaded in reverse order.
1634 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1635 /// values.
1636 static __inline__ __m128d __DEFAULT_FN_ATTRS
1637 _mm_loadr_pd(double const *__dp)
1638 {
1639  __m128d __u = *(__m128d*)__dp;
1640  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1641 }
1642 
1643 /// Loads a 128-bit floating-point vector of [2 x double] from an
1644 /// unaligned memory location.
1645 ///
1646 /// \headerfile <x86intrin.h>
1647 ///
1648 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1649 ///
1650 /// \param __dp
1651 /// A pointer to a 128-bit memory location. The address of the memory
1652 /// location does not have to be aligned.
1653 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1654 static __inline__ __m128d __DEFAULT_FN_ATTRS
1655 _mm_loadu_pd(double const *__dp)
1656 {
1657  struct __loadu_pd {
1658  __m128d_u __v;
1659  } __attribute__((__packed__, __may_alias__));
1660  return ((struct __loadu_pd*)__dp)->__v;
1661 }
1662 
1663 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1664 /// vector and clears the upper element.
1665 ///
1666 /// \headerfile <x86intrin.h>
1667 ///
1668 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1669 ///
1670 /// \param __a
1671 /// A pointer to a 64-bit memory location. The address of the memory
1672 /// location does not have to be aligned.
1673 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1674 static __inline__ __m128i __DEFAULT_FN_ATTRS
1675 _mm_loadu_si64(void const *__a)
1676 {
1677  struct __loadu_si64 {
1678  long long __v;
1679  } __attribute__((__packed__, __may_alias__));
1680  long long __u = ((struct __loadu_si64*)__a)->__v;
1681  return __extension__ (__m128i)(__v2di){__u, 0LL};
1682 }
1683 
1684 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1685 /// vector and clears the upper elements.
1686 ///
1687 /// \headerfile <x86intrin.h>
1688 ///
1689 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1690 ///
1691 /// \param __a
1692 /// A pointer to a 32-bit memory location. The address of the memory
1693 /// location does not have to be aligned.
1694 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1695 static __inline__ __m128i __DEFAULT_FN_ATTRS
1696 _mm_loadu_si32(void const *__a)
1697 {
1698  struct __loadu_si32 {
1699  int __v;
1700  } __attribute__((__packed__, __may_alias__));
1701  int __u = ((struct __loadu_si32*)__a)->__v;
1702  return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
1703 }
1704 
1705 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1706 /// vector and clears the upper elements.
1707 ///
1708 /// \headerfile <x86intrin.h>
1709 ///
1710 /// This intrinsic does not correspond to a specific instruction.
1711 ///
1712 /// \param __a
1713 /// A pointer to a 16-bit memory location. The address of the memory
1714 /// location does not have to be aligned.
1715 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1716 static __inline__ __m128i __DEFAULT_FN_ATTRS
1717 _mm_loadu_si16(void const *__a)
1718 {
1719  struct __loadu_si16 {
1720  short __v;
1721  } __attribute__((__packed__, __may_alias__));
1722  short __u = ((struct __loadu_si16*)__a)->__v;
1723  return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1724 }
1725 
1726 /// Loads a 64-bit double-precision value to the low element of a
1727 /// 128-bit vector of [2 x double] and clears the upper element.
1728 ///
1729 /// \headerfile <x86intrin.h>
1730 ///
1731 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1732 ///
1733 /// \param __dp
1734 /// A pointer to a memory location containing a double-precision value.
1735 /// The address of the memory location does not have to be aligned.
1736 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1737 static __inline__ __m128d __DEFAULT_FN_ATTRS
1738 _mm_load_sd(double const *__dp)
1739 {
1740  struct __mm_load_sd_struct {
1741  double __u;
1742  } __attribute__((__packed__, __may_alias__));
1743  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
1744  return __extension__ (__m128d){ __u, 0 };
1745 }
1746 
1747 /// Loads a double-precision value into the high-order bits of a 128-bit
1748 /// vector of [2 x double]. The low-order bits are copied from the low-order
1749 /// bits of the first operand.
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1754 ///
1755 /// \param __a
1756 /// A 128-bit vector of [2 x double]. \n
1757 /// Bits [63:0] are written to bits [63:0] of the result.
1758 /// \param __dp
1759 /// A pointer to a 64-bit memory location containing a double-precision
1760 /// floating-point value that is loaded. The loaded value is written to bits
1761 /// [127:64] of the result. The address of the memory location does not have
1762 /// to be aligned.
1763 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1764 static __inline__ __m128d __DEFAULT_FN_ATTRS
1765 _mm_loadh_pd(__m128d __a, double const *__dp)
1766 {
1767  struct __mm_loadh_pd_struct {
1768  double __u;
1769  } __attribute__((__packed__, __may_alias__));
1770  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
1771  return __extension__ (__m128d){ __a[0], __u };
1772 }
1773 
1774 /// Loads a double-precision value into the low-order bits of a 128-bit
1775 /// vector of [2 x double]. The high-order bits are copied from the
1776 /// high-order bits of the first operand.
1777 ///
1778 /// \headerfile <x86intrin.h>
1779 ///
1780 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1781 ///
1782 /// \param __a
1783 /// A 128-bit vector of [2 x double]. \n
1784 /// Bits [127:64] are written to bits [127:64] of the result.
1785 /// \param __dp
1786 /// A pointer to a 64-bit memory location containing a double-precision
1787 /// floating-point value that is loaded. The loaded value is written to bits
1788 /// [63:0] of the result. The address of the memory location does not have to
1789 /// be aligned.
1790 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1791 static __inline__ __m128d __DEFAULT_FN_ATTRS
1792 _mm_loadl_pd(__m128d __a, double const *__dp)
1793 {
1794  struct __mm_loadl_pd_struct {
1795  double __u;
1796  } __attribute__((__packed__, __may_alias__));
1797  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
1798  return __extension__ (__m128d){ __u, __a[1] };
1799 }
1800 
1801 /// Constructs a 128-bit floating-point vector of [2 x double] with
1802 /// unspecified content. This could be used as an argument to another
1803 /// intrinsic function where the argument is required but the value is not
1804 /// actually used.
1805 ///
1806 /// \headerfile <x86intrin.h>
1807 ///
1808 /// This intrinsic has no corresponding instruction.
1809 ///
1810 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1811 /// content.
1812 static __inline__ __m128d __DEFAULT_FN_ATTRS
1814 {
1815  return (__m128d)__builtin_ia32_undef128();
1816 }
1817 
1818 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1819 /// 64 bits of the vector are initialized with the specified double-precision
1820 /// floating-point value. The upper 64 bits are set to zero.
1821 ///
1822 /// \headerfile <x86intrin.h>
1823 ///
1824 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1825 ///
1826 /// \param __w
1827 /// A double-precision floating-point value used to initialize the lower 64
1828 /// bits of the result.
1829 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1830 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1831 /// set to zero.
1832 static __inline__ __m128d __DEFAULT_FN_ATTRS
1833 _mm_set_sd(double __w)
1834 {
1835  return __extension__ (__m128d){ __w, 0 };
1836 }
1837 
1838 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1839 /// of the two double-precision floating-point vector elements set to the
1840 /// specified double-precision floating-point value.
1841 ///
1842 /// \headerfile <x86intrin.h>
1843 ///
1844 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1845 ///
1846 /// \param __w
1847 /// A double-precision floating-point value used to initialize each vector
1848 /// element of the result.
1849 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1850 static __inline__ __m128d __DEFAULT_FN_ATTRS
1851 _mm_set1_pd(double __w)
1852 {
1853  return __extension__ (__m128d){ __w, __w };
1854 }
1855 
1856 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1857 /// of the two double-precision floating-point vector elements set to the
1858 /// specified double-precision floating-point value.
1859 ///
1860 /// \headerfile <x86intrin.h>
1861 ///
1862 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1863 ///
1864 /// \param __w
1865 /// A double-precision floating-point value used to initialize each vector
1866 /// element of the result.
1867 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1868 static __inline__ __m128d __DEFAULT_FN_ATTRS
1869 _mm_set_pd1(double __w)
1870 {
1871  return _mm_set1_pd(__w);
1872 }
1873 
1874 /// Constructs a 128-bit floating-point vector of [2 x double]
1875 /// initialized with the specified double-precision floating-point values.
1876 ///
1877 /// \headerfile <x86intrin.h>
1878 ///
1879 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1880 ///
1881 /// \param __w
1882 /// A double-precision floating-point value used to initialize the upper 64
1883 /// bits of the result.
1884 /// \param __x
1885 /// A double-precision floating-point value used to initialize the lower 64
1886 /// bits of the result.
1887 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1888 static __inline__ __m128d __DEFAULT_FN_ATTRS
1889 _mm_set_pd(double __w, double __x)
1890 {
1891  return __extension__ (__m128d){ __x, __w };
1892 }
1893 
1894 /// Constructs a 128-bit floating-point vector of [2 x double],
1895 /// initialized in reverse order with the specified double-precision
1896 /// floating-point values.
1897 ///
1898 /// \headerfile <x86intrin.h>
1899 ///
1900 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1901 ///
1902 /// \param __w
1903 /// A double-precision floating-point value used to initialize the lower 64
1904 /// bits of the result.
1905 /// \param __x
1906 /// A double-precision floating-point value used to initialize the upper 64
1907 /// bits of the result.
1908 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1909 static __inline__ __m128d __DEFAULT_FN_ATTRS
1910 _mm_setr_pd(double __w, double __x)
1911 {
1912  return __extension__ (__m128d){ __w, __x };
1913 }
1914 
1915 /// Constructs a 128-bit floating-point vector of [2 x double]
1916 /// initialized to zero.
1917 ///
1918 /// \headerfile <x86intrin.h>
1919 ///
1920 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1921 ///
1922 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1923 /// all elements set to zero.
1924 static __inline__ __m128d __DEFAULT_FN_ATTRS
1926 {
1927  return __extension__ (__m128d){ 0, 0 };
1928 }
1929 
1930 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1931 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1932 /// 64 bits are set to the upper 64 bits of the first parameter.
1933 ///
1934 /// \headerfile <x86intrin.h>
1935 ///
1936 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1937 ///
1938 /// \param __a
1939 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1940 /// upper 64 bits of the result.
1941 /// \param __b
1942 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1943 /// lower 64 bits of the result.
1944 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1945 static __inline__ __m128d __DEFAULT_FN_ATTRS
1946 _mm_move_sd(__m128d __a, __m128d __b)
1947 {
1948  __a[0] = __b[0];
1949  return __a;
1950 }
1951 
1952 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1953 /// memory location.
1954 ///
1955 /// \headerfile <x86intrin.h>
1956 ///
1957 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1958 ///
1959 /// \param __dp
1960 /// A pointer to a 64-bit memory location.
1961 /// \param __a
1962 /// A 128-bit vector of [2 x double] containing the value to be stored.
1963 static __inline__ void __DEFAULT_FN_ATTRS
1964 _mm_store_sd(double *__dp, __m128d __a)
1965 {
1966  struct __mm_store_sd_struct {
1967  double __u;
1968  } __attribute__((__packed__, __may_alias__));
1969  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
1970 }
1971 
1972 /// Moves packed double-precision values from a 128-bit vector of
1973 /// [2 x double] to a memory location.
1974 ///
1975 /// \headerfile <x86intrin.h>
1976 ///
1977 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1978 ///
1979 /// \param __dp
1980 /// A pointer to an aligned memory location that can store two
1981 /// double-precision values.
1982 /// \param __a
1983 /// A packed 128-bit vector of [2 x double] containing the values to be
1984 /// moved.
1985 static __inline__ void __DEFAULT_FN_ATTRS
1986 _mm_store_pd(double *__dp, __m128d __a)
1987 {
1988  *(__m128d*)__dp = __a;
1989 }
1990 
1991 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1992 /// the upper and lower 64 bits of a memory location.
1993 ///
1994 /// \headerfile <x86intrin.h>
1995 ///
1996 /// This intrinsic corresponds to the
1997 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1998 ///
1999 /// \param __dp
2000 /// A pointer to a memory location that can store two double-precision
2001 /// values.
2002 /// \param __a
2003 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
2004 /// of the values in \a __dp.
2005 static __inline__ void __DEFAULT_FN_ATTRS
2006 _mm_store1_pd(double *__dp, __m128d __a)
2007 {
2008  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
2009  _mm_store_pd(__dp, __a);
2010 }
2011 
2012 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
2013 /// the upper and lower 64 bits of a memory location.
2014 ///
2015 /// \headerfile <x86intrin.h>
2016 ///
2017 /// This intrinsic corresponds to the
2018 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
2019 ///
2020 /// \param __dp
2021 /// A pointer to a memory location that can store two double-precision
2022 /// values.
2023 /// \param __a
2024 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
2025 /// of the values in \a __dp.
2026 static __inline__ void __DEFAULT_FN_ATTRS
2027 _mm_store_pd1(double *__dp, __m128d __a)
2028 {
2029  _mm_store1_pd(__dp, __a);
2030 }
2031 
2032 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
2033 /// location.
2034 ///
2035 /// \headerfile <x86intrin.h>
2036 ///
2037 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
2038 ///
2039 /// \param __dp
2040 /// A pointer to a 128-bit memory location. The address of the memory
2041 /// location does not have to be aligned.
2042 /// \param __a
2043 /// A 128-bit vector of [2 x double] containing the values to be stored.
2044 static __inline__ void __DEFAULT_FN_ATTRS
2045 _mm_storeu_pd(double *__dp, __m128d __a)
2046 {
2047  struct __storeu_pd {
2048  __m128d_u __v;
2049  } __attribute__((__packed__, __may_alias__));
2050  ((struct __storeu_pd*)__dp)->__v = __a;
2051 }
2052 
2053 /// Stores two double-precision values, in reverse order, from a 128-bit
2054 /// vector of [2 x double] to a 16-byte aligned memory location.
2055 ///
2056 /// \headerfile <x86intrin.h>
2057 ///
2058 /// This intrinsic corresponds to a shuffling instruction followed by a
2059 /// <c> VMOVAPD / MOVAPD </c> instruction.
2060 ///
2061 /// \param __dp
2062 /// A pointer to a 16-byte aligned memory location that can store two
2063 /// double-precision values.
2064 /// \param __a
2065 /// A 128-bit vector of [2 x double] containing the values to be reversed and
2066 /// stored.
2067 static __inline__ void __DEFAULT_FN_ATTRS
2068 _mm_storer_pd(double *__dp, __m128d __a)
2069 {
2070  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2071  *(__m128d *)__dp = __a;
2072 }
2073 
2074 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2075 /// memory location.
2076 ///
2077 /// \headerfile <x86intrin.h>
2078 ///
2079 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2080 ///
2081 /// \param __dp
2082 /// A pointer to a 64-bit memory location.
2083 /// \param __a
2084 /// A 128-bit vector of [2 x double] containing the value to be stored.
2085 static __inline__ void __DEFAULT_FN_ATTRS
2086 _mm_storeh_pd(double *__dp, __m128d __a)
2087 {
2088  struct __mm_storeh_pd_struct {
2089  double __u;
2090  } __attribute__((__packed__, __may_alias__));
2091  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
2092 }
2093 
2094 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2095 /// memory location.
2096 ///
2097 /// \headerfile <x86intrin.h>
2098 ///
2099 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2100 ///
2101 /// \param __dp
2102 /// A pointer to a 64-bit memory location.
2103 /// \param __a
2104 /// A 128-bit vector of [2 x double] containing the value to be stored.
2105 static __inline__ void __DEFAULT_FN_ATTRS
2106 _mm_storel_pd(double *__dp, __m128d __a)
2107 {
2108  struct __mm_storeh_pd_struct {
2109  double __u;
2110  } __attribute__((__packed__, __may_alias__));
2111  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
2112 }
2113 
2114 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2115 /// saving the lower 8 bits of each sum in the corresponding element of a
2116 /// 128-bit result vector of [16 x i8].
2117 ///
2118 /// The integer elements of both parameters can be either signed or unsigned.
2119 ///
2120 /// \headerfile <x86intrin.h>
2121 ///
2122 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2123 ///
2124 /// \param __a
2125 /// A 128-bit vector of [16 x i8].
2126 /// \param __b
2127 /// A 128-bit vector of [16 x i8].
2128 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2129 /// parameters.
2130 static __inline__ __m128i __DEFAULT_FN_ATTRS
2131 _mm_add_epi8(__m128i __a, __m128i __b)
2132 {
2133  return (__m128i)((__v16qu)__a + (__v16qu)__b);
2134 }
2135 
2136 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2137 /// saving the lower 16 bits of each sum in the corresponding element of a
2138 /// 128-bit result vector of [8 x i16].
2139 ///
2140 /// The integer elements of both parameters can be either signed or unsigned.
2141 ///
2142 /// \headerfile <x86intrin.h>
2143 ///
2144 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2145 ///
2146 /// \param __a
2147 /// A 128-bit vector of [8 x i16].
2148 /// \param __b
2149 /// A 128-bit vector of [8 x i16].
2150 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2151 /// parameters.
2152 static __inline__ __m128i __DEFAULT_FN_ATTRS
2153 _mm_add_epi16(__m128i __a, __m128i __b)
2154 {
2155  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2156 }
2157 
2158 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2159 /// saving the lower 32 bits of each sum in the corresponding element of a
2160 /// 128-bit result vector of [4 x i32].
2161 ///
2162 /// The integer elements of both parameters can be either signed or unsigned.
2163 ///
2164 /// \headerfile <x86intrin.h>
2165 ///
2166 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2167 ///
2168 /// \param __a
2169 /// A 128-bit vector of [4 x i32].
2170 /// \param __b
2171 /// A 128-bit vector of [4 x i32].
2172 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2173 /// parameters.
2174 static __inline__ __m128i __DEFAULT_FN_ATTRS
2175 _mm_add_epi32(__m128i __a, __m128i __b)
2176 {
2177  return (__m128i)((__v4su)__a + (__v4su)__b);
2178 }
2179 
2180 /// Adds two signed or unsigned 64-bit integer values, returning the
2181 /// lower 64 bits of the sum.
2182 ///
2183 /// \headerfile <x86intrin.h>
2184 ///
2185 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2186 ///
2187 /// \param __a
2188 /// A 64-bit integer.
2189 /// \param __b
2190 /// A 64-bit integer.
2191 /// \returns A 64-bit integer containing the sum of both parameters.
2192 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2193 _mm_add_si64(__m64 __a, __m64 __b)
2194 {
2195  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2196 }
2197 
2198 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2199 /// saving the lower 64 bits of each sum in the corresponding element of a
2200 /// 128-bit result vector of [2 x i64].
2201 ///
2202 /// The integer elements of both parameters can be either signed or unsigned.
2203 ///
2204 /// \headerfile <x86intrin.h>
2205 ///
2206 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2207 ///
2208 /// \param __a
2209 /// A 128-bit vector of [2 x i64].
2210 /// \param __b
2211 /// A 128-bit vector of [2 x i64].
2212 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2213 /// parameters.
2214 static __inline__ __m128i __DEFAULT_FN_ATTRS
2215 _mm_add_epi64(__m128i __a, __m128i __b)
2216 {
2217  return (__m128i)((__v2du)__a + (__v2du)__b);
2218 }
2219 
2220 /// Adds, with saturation, the corresponding elements of two 128-bit
2221 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2222 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2223 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2224 ///
2225 /// \headerfile <x86intrin.h>
2226 ///
2227 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2228 ///
2229 /// \param __a
2230 /// A 128-bit signed [16 x i8] vector.
2231 /// \param __b
2232 /// A 128-bit signed [16 x i8] vector.
2233 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2234 /// both parameters.
2235 static __inline__ __m128i __DEFAULT_FN_ATTRS
2236 _mm_adds_epi8(__m128i __a, __m128i __b)
2237 {
2238  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
2239 }
2240 
2241 /// Adds, with saturation, the corresponding elements of two 128-bit
2242 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2243 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2244 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2245 /// 0x8000.
2246 ///
2247 /// \headerfile <x86intrin.h>
2248 ///
2249 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2250 ///
2251 /// \param __a
2252 /// A 128-bit signed [8 x i16] vector.
2253 /// \param __b
2254 /// A 128-bit signed [8 x i16] vector.
2255 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2256 /// both parameters.
2257 static __inline__ __m128i __DEFAULT_FN_ATTRS
2258 _mm_adds_epi16(__m128i __a, __m128i __b)
2259 {
2260  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
2261 }
2262 
2263 /// Adds, with saturation, the corresponding elements of two 128-bit
2264 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2265 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2266 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2267 ///
2268 /// \headerfile <x86intrin.h>
2269 ///
2270 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2271 ///
2272 /// \param __a
2273 /// A 128-bit unsigned [16 x i8] vector.
2274 /// \param __b
2275 /// A 128-bit unsigned [16 x i8] vector.
2276 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2277 /// of both parameters.
2278 static __inline__ __m128i __DEFAULT_FN_ATTRS
2279 _mm_adds_epu8(__m128i __a, __m128i __b)
2280 {
2281  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
2282 }
2283 
2284 /// Adds, with saturation, the corresponding elements of two 128-bit
2285 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2286 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2287 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2288 ///
2289 /// \headerfile <x86intrin.h>
2290 ///
2291 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2292 ///
2293 /// \param __a
2294 /// A 128-bit unsigned [8 x i16] vector.
2295 /// \param __b
2296 /// A 128-bit unsigned [8 x i16] vector.
2297 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2298 /// of both parameters.
2299 static __inline__ __m128i __DEFAULT_FN_ATTRS
2300 _mm_adds_epu16(__m128i __a, __m128i __b)
2301 {
2302  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
2303 }
2304 
2305 /// Computes the rounded averages of corresponding elements of two
2306 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2307 /// corresponding element of a 128-bit result vector of [16 x i8].
2308 ///
2309 /// \headerfile <x86intrin.h>
2310 ///
2311 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2312 ///
2313 /// \param __a
2314 /// A 128-bit unsigned [16 x i8] vector.
2315 /// \param __b
2316 /// A 128-bit unsigned [16 x i8] vector.
2317 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2318 /// averages of both parameters.
2319 static __inline__ __m128i __DEFAULT_FN_ATTRS
2320 _mm_avg_epu8(__m128i __a, __m128i __b)
2321 {
2322  typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); /* 16 lanes of 16 bits: room for a + b + 1 */
2323  return (__m128i)__builtin_convertvector(
2324  ((__builtin_convertvector((__v16qu)__a, __v16hu) + /* widen __a lane by lane */
2325  __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) /* widen __b, sum; the +1 makes the shift round up */
2326  >> 1, __v16qu); /* halve, then narrow each lane back to __v16qu */
2327 }
2328 
2329 /// Computes the rounded averages of corresponding elements of two
2330 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2331 /// corresponding element of a 128-bit result vector of [8 x i16].
2332 ///
2333 /// \headerfile <x86intrin.h>
2334 ///
2335 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2336 ///
2337 /// \param __a
2338 /// A 128-bit unsigned [8 x i16] vector.
2339 /// \param __b
2340 /// A 128-bit unsigned [8 x i16] vector.
2341 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2342 /// averages of both parameters.
2343 static __inline__ __m128i __DEFAULT_FN_ATTRS
2344 _mm_avg_epu16(__m128i __a, __m128i __b)
2345 {
2346  typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); /* 8 lanes of 32 bits: room for a + b + 1 */
2347  return (__m128i)__builtin_convertvector(
2348  ((__builtin_convertvector((__v8hu)__a, __v8su) + /* widen __a lane by lane */
2349  __builtin_convertvector((__v8hu)__b, __v8su)) + 1) /* widen __b, sum; the +1 makes the shift round up */
2350  >> 1, __v8hu); /* halve, then narrow each lane back to __v8hu */
2351 }
2352 
2353 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2354 /// vectors, producing eight intermediate 32-bit signed integer products, and
2355 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2356 /// [4 x i32] vector.
2357 ///
2358 /// For example, bits [15:0] of both parameters are multiplied producing a
2359 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2360 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2361 /// of the result.
2362 ///
2363 /// \headerfile <x86intrin.h>
2364 ///
2365 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2366 ///
2367 /// \param __a
2368 /// A 128-bit signed [8 x i16] vector.
2369 /// \param __b
2370 /// A 128-bit signed [8 x i16] vector.
2371 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2372 /// of both parameters.
2373 static __inline__ __m128i __DEFAULT_FN_ATTRS
2374 _mm_madd_epi16(__m128i __a, __m128i __b)
2375 { /* Operands go to the pmaddwd128 builtin as __v8hi (eight signed 16-bit lanes); result is cast to __m128i. */
2376  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2377 }
2378 
2379 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2380 /// vectors, saving the greater value from each comparison in the
2381 /// corresponding element of a 128-bit result vector of [8 x i16].
2382 ///
2383 /// \headerfile <x86intrin.h>
2384 ///
2385 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2386 ///
2387 /// \param __a
2388 /// A 128-bit signed [8 x i16] vector.
2389 /// \param __b
2390 /// A 128-bit signed [8 x i16] vector.
2391 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2392 /// each comparison.
2393 static __inline__ __m128i __DEFAULT_FN_ATTRS
2394 _mm_max_epi16(__m128i __a, __m128i __b)
2395 { /* Lane-wise maximum is delegated to the pmaxsw128 builtin on the __v8hi views of both operands. */
2396  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
2397 }
2398 
2399 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2400 /// vectors, saving the greater value from each comparison in the
2401 /// corresponding element of a 128-bit result vector of [16 x i8].
2402 ///
2403 /// \headerfile <x86intrin.h>
2404 ///
2405 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2406 ///
2407 /// \param __a
2408 /// A 128-bit unsigned [16 x i8] vector.
2409 /// \param __b
2410 /// A 128-bit unsigned [16 x i8] vector.
2411 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2412 /// each comparison.
2413 static __inline__ __m128i __DEFAULT_FN_ATTRS
2414 _mm_max_epu8(__m128i __a, __m128i __b)
2415 { /* Lane-wise maximum is delegated to the pmaxub128 builtin, which takes its operands as __v16qi. */
2416  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
2417 }
2418 
2419 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2420 /// vectors, saving the smaller value from each comparison in the
2421 /// corresponding element of a 128-bit result vector of [8 x i16].
2422 ///
2423 /// \headerfile <x86intrin.h>
2424 ///
2425 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2426 ///
2427 /// \param __a
2428 /// A 128-bit signed [8 x i16] vector.
2429 /// \param __b
2430 /// A 128-bit signed [8 x i16] vector.
2431 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2432 /// each comparison.
2433 static __inline__ __m128i __DEFAULT_FN_ATTRS
2434 _mm_min_epi16(__m128i __a, __m128i __b)
2435 { /* Lane-wise minimum is delegated to the pminsw128 builtin on the __v8hi views of both operands. */
2436  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
2437 }
2438 
2439 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2440 /// vectors, saving the smaller value from each comparison in the
2441 /// corresponding element of a 128-bit result vector of [16 x i8].
2442 ///
2443 /// \headerfile <x86intrin.h>
2444 ///
2445 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2446 ///
2447 /// \param __a
2448 /// A 128-bit unsigned [16 x i8] vector.
2449 /// \param __b
2450 /// A 128-bit unsigned [16 x i8] vector.
2451 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2452 /// each comparison.
2453 static __inline__ __m128i __DEFAULT_FN_ATTRS
2454 _mm_min_epu8(__m128i __a, __m128i __b)
2455 { /* Lane-wise minimum is delegated to the pminub128 builtin, which takes its operands as __v16qi. */
2456  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
2457 }
2458 
2459 /// Multiplies the corresponding elements of two signed [8 x i16]
2460 /// vectors, saving the upper 16 bits of each 32-bit product in the
2461 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2462 ///
2463 /// \headerfile <x86intrin.h>
2464 ///
2465 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2466 ///
2467 /// \param __a
2468 /// A 128-bit signed [8 x i16] vector.
2469 /// \param __b
2470 /// A 128-bit signed [8 x i16] vector.
2471 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2472 /// each of the eight 32-bit products.
2473 static __inline__ __m128i __DEFAULT_FN_ATTRS
2474 _mm_mulhi_epi16(__m128i __a, __m128i __b)
2475 { /* The high halves of the products come directly from the pmulhw128 builtin on __v8hi operands. */
2476  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2477 }
2478 
2479 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2480 /// vectors, saving the upper 16 bits of each 32-bit product in the
2481 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2482 ///
2483 /// \headerfile <x86intrin.h>
2484 ///
2485 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2486 ///
2487 /// \param __a
2488 /// A 128-bit unsigned [8 x i16] vector.
2489 /// \param __b
2490 /// A 128-bit unsigned [8 x i16] vector.
2491 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2492 /// of each of the eight 32-bit products.
2493 static __inline__ __m128i __DEFAULT_FN_ATTRS
2494 _mm_mulhi_epu16(__m128i __a, __m128i __b)
2495 { /* Operands are passed as __v8hi (a vector of short); the unsigned treatment comes from the pmulhuw128 builtin. */
2496  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2497 }
2498 
2499 /// Multiplies the corresponding elements of two signed [8 x i16]
2500 /// vectors, saving the lower 16 bits of each 32-bit product in the
2501 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2502 ///
2503 /// \headerfile <x86intrin.h>
2504 ///
2505 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2506 ///
2507 /// \param __a
2508 /// A 128-bit signed [8 x i16] vector.
2509 /// \param __b
2510 /// A 128-bit signed [8 x i16] vector.
2511 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2512 /// each of the eight 32-bit products.
2513 static __inline__ __m128i __DEFAULT_FN_ATTRS
2514 _mm_mullo_epi16(__m128i __a, __m128i __b)
2515 { /* Uses the vector '*' operator on __v8hu views instead of a builtin. NOTE(review): presumably __v8hu is the unsigned 16-bit view so wraparound is well defined - confirm against its typedef. */
2516  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2517 }
2518 
2519 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2520 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2521 /// product.
2522 ///
2523 /// \headerfile <x86intrin.h>
2524 ///
2525 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2526 ///
2527 /// \param __a
2528 /// A 64-bit integer containing one of the source operands.
2529 /// \param __b
2530 /// A 64-bit integer containing one of the source operands.
2531 /// \returns A 64-bit integer vector containing the product of both operands.
2532 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2533 _mm_mul_su32(__m64 __a, __m64 __b)
2534 { /* MMX-attributed form: both __m64 operands are viewed as __v2si; the builtin's result is returned without a cast. */
2535  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2536 }
2537 
2538 /// Multiplies 32-bit unsigned integer values contained in the lower
2539 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2540 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2541 ///
2542 /// \headerfile <x86intrin.h>
2543 ///
2544 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2545 ///
2546 /// \param __a
2547 /// A [2 x i64] vector containing one of the source operands.
2548 /// \param __b
2549 /// A [2 x i64] vector containing one of the source operands.
2550 /// \returns A [2 x i64] vector containing the product of both operands.
2551 static __inline__ __m128i __DEFAULT_FN_ATTRS
2552 _mm_mul_epu32(__m128i __a, __m128i __b)
2553 { /* Operands are viewed as __v4si for the pmuludq128 builtin; its result is returned without a cast. */
2554  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2555 }
2556 
2557 /// Computes the absolute differences of corresponding 8-bit integer
2558 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2559 /// separately sums the second 8 absolute differences. Packs these two
2560 /// unsigned 16-bit integer sums into the upper and lower elements of a
2561 /// [2 x i64] vector.
2562 ///
2563 /// \headerfile <x86intrin.h>
2564 ///
2565 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2566 ///
2567 /// \param __a
2568 /// A 128-bit integer vector containing one of the source operands.
2569 /// \param __b
2570 /// A 128-bit integer vector containing one of the source operands.
2571 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2572 /// differences between both operands.
2573 static __inline__ __m128i __DEFAULT_FN_ATTRS
2574 _mm_sad_epu8(__m128i __a, __m128i __b)
2575 { /* Operands are viewed as __v16qi for the psadbw128 builtin; its result is returned without a cast. */
2576  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2577 }
2578 
2579 /// Subtracts the corresponding 8-bit integer values in the operands.
2580 ///
2581 /// \headerfile <x86intrin.h>
2582 ///
2583 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2584 ///
2585 /// \param __a
2586 /// A 128-bit integer vector containing the minuends.
2587 /// \param __b
2588 /// A 128-bit integer vector containing the subtrahends.
2589 /// \returns A 128-bit integer vector containing the differences of the values
2590 /// in the operands.
2591 static __inline__ __m128i __DEFAULT_FN_ATTRS
2592 _mm_sub_epi8(__m128i __a, __m128i __b)
2593 { /* No builtin: the vector '-' operator is applied to the __v16qu views and the result is cast back to __m128i. */
2594  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2595 }
2596 
2597 /// Subtracts the corresponding 16-bit integer values in the operands.
2598 ///
2599 /// \headerfile <x86intrin.h>
2600 ///
2601 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2602 ///
2603 /// \param __a
2604 /// A 128-bit integer vector containing the minuends.
2605 /// \param __b
2606 /// A 128-bit integer vector containing the subtrahends.
2607 /// \returns A 128-bit integer vector containing the differences of the values
2608 /// in the operands.
2609 static __inline__ __m128i __DEFAULT_FN_ATTRS
2610 _mm_sub_epi16(__m128i __a, __m128i __b)
2611 { /* No builtin: the vector '-' operator is applied to the __v8hu views and the result is cast back to __m128i. */
2612  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2613 }
2614 
2615 /// Subtracts the corresponding 32-bit integer values in the operands.
2616 ///
2617 /// \headerfile <x86intrin.h>
2618 ///
2619 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2620 ///
2621 /// \param __a
2622 /// A 128-bit integer vector containing the minuends.
2623 /// \param __b
2624 /// A 128-bit integer vector containing the subtrahends.
2625 /// \returns A 128-bit integer vector containing the differences of the values
2626 /// in the operands.
2627 static __inline__ __m128i __DEFAULT_FN_ATTRS
2628 _mm_sub_epi32(__m128i __a, __m128i __b)
2629 { /* No builtin: the vector '-' operator is applied to the __v4su views and the result is cast back to __m128i. */
2630  return (__m128i)((__v4su)__a - (__v4su)__b);
2631 }
2632 
2633 /// Subtracts signed or unsigned 64-bit integer values and writes the
2634 /// difference to the corresponding bits in the destination.
2635 ///
2636 /// \headerfile <x86intrin.h>
2637 ///
2638 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2639 ///
2640 /// \param __a
2641 /// A 64-bit integer vector containing the minuend.
2642 /// \param __b
2643 /// A 64-bit integer vector containing the subtrahend.
2644 /// \returns A 64-bit integer vector containing the difference of the values in
2645 /// the operands.
2646 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2647 _mm_sub_si64(__m64 __a, __m64 __b)
2648 { /* MMX-attributed form: both __m64 operands are viewed as __v1di for the psubq builtin; result is cast to __m64. */
2649  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2650 }
2651 
2652 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2653 ///
2654 /// \headerfile <x86intrin.h>
2655 ///
2656 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2657 ///
2658 /// \param __a
2659 /// A 128-bit integer vector containing the minuends.
2660 /// \param __b
2661 /// A 128-bit integer vector containing the subtrahends.
2662 /// \returns A 128-bit integer vector containing the differences of the values
2663 /// in the operands.
2664 static __inline__ __m128i __DEFAULT_FN_ATTRS
2665 _mm_sub_epi64(__m128i __a, __m128i __b)
2666 { /* No builtin: the vector '-' operator is applied to the __v2du views and the result is cast back to __m128i. */
2667  return (__m128i)((__v2du)__a - (__v2du)__b);
2668 }
2669 
2670 /// Subtracts corresponding 8-bit signed integer values in the input and
2671 /// returns the differences in the corresponding bytes in the destination.
2672 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2673 /// than 0x80 are saturated to 0x80.
2674 ///
2675 /// \headerfile <x86intrin.h>
2676 ///
2677 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2678 ///
2679 /// \param __a
2680 /// A 128-bit integer vector containing the minuends.
2681 /// \param __b
2682 /// A 128-bit integer vector containing the subtrahends.
2683 /// \returns A 128-bit integer vector containing the differences of the values
2684 /// in the operands.
2685 static __inline__ __m128i __DEFAULT_FN_ATTRS
2686 _mm_subs_epi8(__m128i __a, __m128i __b)
2687 { /* Saturation is handled by the psubsb128 builtin; operands are passed as __v16qi with __a as the minuend. */
2688  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
2689 }
2690 
2691 /// Subtracts corresponding 16-bit signed integer values in the input and
2692 /// returns the differences in the corresponding 16-bit elements of the
2693 /// destination. Differences greater than 0x7FFF are saturated to 0x7FFF, and
2694 /// differences less than 0x8000 are saturated to 0x8000.
2695 ///
2696 /// \headerfile <x86intrin.h>
2697 ///
2698 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2699 ///
2700 /// \param __a
2701 /// A 128-bit integer vector containing the minuends.
2702 /// \param __b
2703 /// A 128-bit integer vector containing the subtrahends.
2704 /// \returns A 128-bit integer vector containing the differences of the values
2705 /// in the operands.
2706 static __inline__ __m128i __DEFAULT_FN_ATTRS
2707 _mm_subs_epi16(__m128i __a, __m128i __b)
2708 { /* Saturation is handled by the psubsw128 builtin; operands are passed as __v8hi with __a as the minuend. */
2709  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
2710 }
2711 
2712 /// Subtracts corresponding 8-bit unsigned integer values in the input
2713 /// and returns the differences in the corresponding bytes in the
2714 /// destination. Differences less than 0x00 are saturated to 0x00.
2715 ///
2716 /// \headerfile <x86intrin.h>
2717 ///
2718 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2719 ///
2720 /// \param __a
2721 /// A 128-bit integer vector containing the minuends.
2722 /// \param __b
2723 /// A 128-bit integer vector containing the subtrahends.
2724 /// \returns A 128-bit integer vector containing the unsigned integer
2725 /// differences of the values in the operands.
2726 static __inline__ __m128i __DEFAULT_FN_ATTRS
2727 _mm_subs_epu8(__m128i __a, __m128i __b)
2728 { /* Saturation is handled by the psubusb128 builtin, which takes its operands as __v16qi; __a is the minuend. */
2729  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
2730 }
2731 
2732 /// Subtracts corresponding 16-bit unsigned integer values in the input
2733 /// and returns the differences in the corresponding 16-bit elements of the
2734 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2735 ///
2736 /// \headerfile <x86intrin.h>
2737 ///
2738 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2739 ///
2740 /// \param __a
2741 /// A 128-bit integer vector containing the minuends.
2742 /// \param __b
2743 /// A 128-bit integer vector containing the subtrahends.
2744 /// \returns A 128-bit integer vector containing the unsigned integer
2745 /// differences of the values in the operands.
2746 static __inline__ __m128i __DEFAULT_FN_ATTRS
2747 _mm_subs_epu16(__m128i __a, __m128i __b)
2748 { /* Saturation is handled by the psubusw128 builtin, which takes its operands as __v8hi; __a is the minuend. */
2749  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
2750 }
2751 
2752 /// Performs a bitwise AND of two 128-bit integer vectors.
2753 ///
2754 /// \headerfile <x86intrin.h>
2755 ///
2756 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2757 ///
2758 /// \param __a
2759 /// A 128-bit integer vector containing one of the source operands.
2760 /// \param __b
2761 /// A 128-bit integer vector containing one of the source operands.
2762 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2763 /// in both operands.
2764 static __inline__ __m128i __DEFAULT_FN_ATTRS
2765 _mm_and_si128(__m128i __a, __m128i __b)
2766 { /* No builtin: the '&' operator is applied to the __v2du views of both operands. */
2767  return (__m128i)((__v2du)__a & (__v2du)__b);
2768 }
2769 
2770 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2771 /// one's complement of the values contained in the first source operand.
2772 ///
2773 /// \headerfile <x86intrin.h>
2774 ///
2775 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2776 ///
2777 /// \param __a
2778 /// A 128-bit vector containing the left source operand. The one's complement
2779 /// of this value is used in the bitwise AND.
2780 /// \param __b
2781 /// A 128-bit vector containing the right source operand.
2782 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2783 /// complement of the first operand and the values in the second operand.
2784 static __inline__ __m128i __DEFAULT_FN_ATTRS
2785 _mm_andnot_si128(__m128i __a, __m128i __b)
2786 { /* Only __a is complemented: the result is (~__a) & __b, so the argument order matters. */
2787  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2788 }
2789 /// Performs a bitwise OR of two 128-bit integer vectors.
2790 ///
2791 /// \headerfile <x86intrin.h>
2792 ///
2793 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2794 ///
2795 /// \param __a
2796 /// A 128-bit integer vector containing one of the source operands.
2797 /// \param __b
2798 /// A 128-bit integer vector containing one of the source operands.
2799 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2800 /// in both operands.
2801 static __inline__ __m128i __DEFAULT_FN_ATTRS
2802 _mm_or_si128(__m128i __a, __m128i __b)
2803 { /* No builtin: the '|' operator is applied to the __v2du views of both operands. */
2804  return (__m128i)((__v2du)__a | (__v2du)__b);
2805 }
2806 
2807 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2808 ///
2809 /// \headerfile <x86intrin.h>
2810 ///
2811 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2812 ///
2813 /// \param __a
2814 /// A 128-bit integer vector containing one of the source operands.
2815 /// \param __b
2816 /// A 128-bit integer vector containing one of the source operands.
2817 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2818 /// values in both operands.
2819 static __inline__ __m128i __DEFAULT_FN_ATTRS
2820 _mm_xor_si128(__m128i __a, __m128i __b)
2821 { /* No builtin: the '^' operator is applied to the __v2du views of both operands. */
2822  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2823 }
2824 
2825 /// Left-shifts the 128-bit integer vector operand by the specified
2826 /// number of bytes. Low-order bits are cleared.
2827 ///
2828 /// \headerfile <x86intrin.h>
2829 ///
2830 /// \code
2831 /// __m128i _mm_slli_si128(__m128i a, const int imm);
2832 /// \endcode
2833 ///
2834 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2835 ///
2836 /// \param a
2837 /// A 128-bit integer vector containing the source operand.
2838 /// \param imm
2839 /// An immediate value specifying the number of bytes to left-shift operand
2840 /// \a a.
2841 /// \returns A 128-bit integer vector containing the left-shifted value.
2842 #define _mm_slli_si128(a, imm) /* whole expansion parenthesized so it is safe inside larger expressions */ \
2843  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
2844 
2845 #define _mm_bslli_si128(a, imm) /* byte-wise left shift of a by imm (same builtin as _mm_slli_si128); expansion fully parenthesized */ \
2846  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
2847 
2848 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2849 /// by the specified number of bits. Low-order bits are cleared.
2850 ///
2851 /// \headerfile <x86intrin.h>
2852 ///
2853 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2854 ///
2855 /// \param __a
2856 /// A 128-bit integer vector containing the source operand.
2857 /// \param __count
2858 /// An integer value specifying the number of bits to left-shift each value
2859 /// in operand \a __a.
2860 /// \returns A 128-bit integer vector containing the left-shifted values.
2861 static __inline__ __m128i __DEFAULT_FN_ATTRS
2862 _mm_slli_epi16(__m128i __a, int __count)
2863 { /* __a is viewed as __v8hi; the scalar __count is forwarded unchanged to the psllwi128 builtin. */
2864  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2865 }
2866 
2867 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2868 /// by the specified number of bits. Low-order bits are cleared.
2869 ///
2870 /// \headerfile <x86intrin.h>
2871 ///
2872 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2873 ///
2874 /// \param __a
2875 /// A 128-bit integer vector containing the source operand.
2876 /// \param __count
2877 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2878 /// to left-shift each value in operand \a __a.
2879 /// \returns A 128-bit integer vector containing the left-shifted values.
2880 static __inline__ __m128i __DEFAULT_FN_ATTRS
2881 _mm_sll_epi16(__m128i __a, __m128i __count)
2882 { /* Vector-count form: __count is handed to the psllw128 builtin under the same __v8hi view as __a. */
2883  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2884 }
2885 
2886 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2887 /// by the specified number of bits. Low-order bits are cleared.
2888 ///
2889 /// \headerfile <x86intrin.h>
2890 ///
2891 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2892 ///
2893 /// \param __a
2894 /// A 128-bit integer vector containing the source operand.
2895 /// \param __count
2896 /// An integer value specifying the number of bits to left-shift each value
2897 /// in operand \a __a.
2898 /// \returns A 128-bit integer vector containing the left-shifted values.
2899 static __inline__ __m128i __DEFAULT_FN_ATTRS
2900 _mm_slli_epi32(__m128i __a, int __count)
2901 { /* __a is viewed as __v4si; the scalar __count is forwarded unchanged to the pslldi128 builtin. */
2902  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2903 }
2904 
2905 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2906 /// by the specified number of bits. Low-order bits are cleared.
2907 ///
2908 /// \headerfile <x86intrin.h>
2909 ///
2910 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2911 ///
2912 /// \param __a
2913 /// A 128-bit integer vector containing the source operand.
2914 /// \param __count
2915 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2916 /// to left-shift each value in operand \a __a.
2917 /// \returns A 128-bit integer vector containing the left-shifted values.
2918 static __inline__ __m128i __DEFAULT_FN_ATTRS
2919 _mm_sll_epi32(__m128i __a, __m128i __count)
2920 { /* Vector-count form: __count is handed to the pslld128 builtin under the same __v4si view as __a. */
2921  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2922 }
2923 
2924 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2925 /// by the specified number of bits. Low-order bits are cleared.
2926 ///
2927 /// \headerfile <x86intrin.h>
2928 ///
2929 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2930 ///
2931 /// \param __a
2932 /// A 128-bit integer vector containing the source operand.
2933 /// \param __count
2934 /// An integer value specifying the number of bits to left-shift each value
2935 /// in operand \a __a.
2936 /// \returns A 128-bit integer vector containing the left-shifted values.
2937 static __inline__ __m128i __DEFAULT_FN_ATTRS
2938 _mm_slli_epi64(__m128i __a, int __count)
2939 { /* __a is viewed as __v2di and __count is forwarded unchanged; the builtin's result is returned without a cast. */
2940  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2941 }
2942 
2943 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2944 /// by the specified number of bits. Low-order bits are cleared.
2945 ///
2946 /// \headerfile <x86intrin.h>
2947 ///
2948 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2949 ///
2950 /// \param __a
2951 /// A 128-bit integer vector containing the source operand.
2952 /// \param __count
2953 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2954 /// to left-shift each value in operand \a __a.
2955 /// \returns A 128-bit integer vector containing the left-shifted values.
2956 static __inline__ __m128i __DEFAULT_FN_ATTRS
2957 _mm_sll_epi64(__m128i __a, __m128i __count)
2958 { /* Vector-count form: both arguments are viewed as __v2di; the builtin's result is returned without a cast. */
2959  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2960 }
2961 
2962 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2963 /// by the specified number of bits. High-order bits are filled with the sign
2964 /// bit of the initial value.
2965 ///
2966 /// \headerfile <x86intrin.h>
2967 ///
2968 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2969 ///
2970 /// \param __a
2971 /// A 128-bit integer vector containing the source operand.
2972 /// \param __count
2973 /// An integer value specifying the number of bits to right-shift each value
2974 /// in operand \a __a.
2975 /// \returns A 128-bit integer vector containing the right-shifted values.
2976 static __inline__ __m128i __DEFAULT_FN_ATTRS
2977 _mm_srai_epi16(__m128i __a, int __count)
2978 { /* __a is viewed as __v8hi; the scalar __count is forwarded unchanged to the psrawi128 builtin. */
2979  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2980 }
2981 
2982 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2983 /// by the specified number of bits. High-order bits are filled with the sign
2984 /// bit of the initial value.
2985 ///
2986 /// \headerfile <x86intrin.h>
2987 ///
2988 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2989 ///
2990 /// \param __a
2991 /// A 128-bit integer vector containing the source operand.
2992 /// \param __count
2993 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2994 /// to right-shift each value in operand \a __a.
2995 /// \returns A 128-bit integer vector containing the right-shifted values.
2996 static __inline__ __m128i __DEFAULT_FN_ATTRS
2997 _mm_sra_epi16(__m128i __a, __m128i __count)
2998 { /* Vector-count form: __count is handed to the psraw128 builtin under the same __v8hi view as __a. */
2999  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
3000 }
3001 
3002 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
3003 /// by the specified number of bits. High-order bits are filled with the sign
3004 /// bit of the initial value.
3005 ///
3006 /// \headerfile <x86intrin.h>
3007 ///
3008 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
3009 ///
3010 /// \param __a
3011 /// A 128-bit integer vector containing the source operand.
3012 /// \param __count
3013 /// An integer value specifying the number of bits to right-shift each value
3014 /// in operand \a __a.
3015 /// \returns A 128-bit integer vector containing the right-shifted values.
3016 static __inline__ __m128i __DEFAULT_FN_ATTRS
3017 _mm_srai_epi32(__m128i __a, int __count)
3018 { /* __a is viewed as __v4si; the scalar __count is forwarded unchanged to the psradi128 builtin. */
3019  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
3020 }
3021 
3022 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
3023 /// by the specified number of bits. High-order bits are filled with the sign
3024 /// bit of the initial value.
3025 ///
3026 /// \headerfile <x86intrin.h>
3027 ///
3028 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
3029 ///
3030 /// \param __a
3031 /// A 128-bit integer vector containing the source operand.
3032 /// \param __count
3033 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3034 /// to right-shift each value in operand \a __a.
3035 /// \returns A 128-bit integer vector containing the right-shifted values.
3036 static __inline__ __m128i __DEFAULT_FN_ATTRS
3037 _mm_sra_epi32(__m128i __a, __m128i __count)
3038 { /* Vector-count form: __count is handed to the psrad128 builtin under the same __v4si view as __a. */
3039  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
3040 }
3041 
3042 /// Right-shifts the 128-bit integer vector operand by the specified
3043 /// number of bytes. High-order bits are cleared.
3044 ///
3045 /// \headerfile <x86intrin.h>
3046 ///
3047 /// \code
3048 /// __m128i _mm_srli_si128(__m128i a, const int imm);
3049 /// \endcode
3050 ///
3051 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
3052 ///
3053 /// \param a
3054 /// A 128-bit integer vector containing the source operand.
3055 /// \param imm
3056 /// An immediate value specifying the number of bytes to right-shift operand
3057 /// \a a.
3058 /// \returns A 128-bit integer vector containing the right-shifted value.
3059 #define _mm_srli_si128(a, imm) /* whole expansion parenthesized so it is safe inside larger expressions */ \
3060  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
3061 
3062 #define _mm_bsrli_si128(a, imm) /* byte-wise right shift of a by imm (same builtin as _mm_srli_si128); expansion fully parenthesized */ \
3063  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
3064 
3065 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
3066 /// operand by the specified number of bits. High-order bits are cleared.
3067 ///
3068 /// \headerfile <x86intrin.h>
3069 ///
3070 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3071 ///
3072 /// \param __a
3073 /// A 128-bit integer vector containing the source operand.
3074 /// \param __count
3075 /// An integer value specifying the number of bits to right-shift each value
3076 /// in operand \a __a.
3077 /// \returns A 128-bit integer vector containing the right-shifted values.
3078 static __inline__ __m128i __DEFAULT_FN_ATTRS
3079 _mm_srli_epi16(__m128i __a, int __count)
3080 { /* __a is viewed as __v8hi; the scalar __count is forwarded unchanged to the psrlwi128 builtin. */
3081  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
3082 }
3083 
3084 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
3085 /// operand by the specified number of bits. High-order bits are cleared.
3086 ///
3087 /// \headerfile <x86intrin.h>
3088 ///
3089 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
3090 ///
3091 /// \param __a
3092 /// A 128-bit integer vector containing the source operand.
3093 /// \param __count
3094 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3095 /// to right-shift each value in operand \a __a.
3096 /// \returns A 128-bit integer vector containing the right-shifted values.
3097 static __inline__ __m128i __DEFAULT_FN_ATTRS
3098 _mm_srl_epi16(__m128i __a, __m128i __count)
3099 { /* Vector-count form: __count is handed to the psrlw128 builtin under the same __v8hi view as __a. */
3100  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3101 }
3102 
3103 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3104 /// operand by the specified number of bits. High-order bits are cleared.
3105 ///
3106 /// \headerfile <x86intrin.h>
3107 ///
3108 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3109 ///
3110 /// \param __a
3111 /// A 128-bit integer vector containing the source operand.
3112 /// \param __count
3113 /// An integer value specifying the number of bits to right-shift each value
3114 /// in operand \a __a.
3115 /// \returns A 128-bit integer vector containing the right-shifted values.
3116 static __inline__ __m128i __DEFAULT_FN_ATTRS
3117 _mm_srli_epi32(__m128i __a, int __count)
3118 {
3119  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3120 }
3121 
3122 /// Right-shifts each of 32-bit values in the 128-bit integer vector
3123 /// operand by the specified number of bits. High-order bits are cleared.
3124 ///
3125 /// \headerfile <x86intrin.h>
3126 ///
3127 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3128 ///
3129 /// \param __a
3130 /// A 128-bit integer vector containing the source operand.
3131 /// \param __count
3132 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3133 /// to right-shift each value in operand \a __a.
3134 /// \returns A 128-bit integer vector containing the right-shifted values.
3135 static __inline__ __m128i __DEFAULT_FN_ATTRS
3136 _mm_srl_epi32(__m128i __a, __m128i __count)
3137 {
3138  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3139 }
3140 
3141 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3142 /// operand by the specified number of bits. High-order bits are cleared.
3143 ///
3144 /// \headerfile <x86intrin.h>
3145 ///
3146 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3147 ///
3148 /// \param __a
3149 /// A 128-bit integer vector containing the source operand.
3150 /// \param __count
3151 /// An integer value specifying the number of bits to right-shift each value
3152 /// in operand \a __a.
3153 /// \returns A 128-bit integer vector containing the right-shifted values.
3154 static __inline__ __m128i __DEFAULT_FN_ATTRS
3155 _mm_srli_epi64(__m128i __a, int __count)
3156 {
3157  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3158 }
3159 
3160 /// Right-shifts each of 64-bit values in the 128-bit integer vector
3161 /// operand by the specified number of bits. High-order bits are cleared.
3162 ///
3163 /// \headerfile <x86intrin.h>
3164 ///
3165 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3166 ///
3167 /// \param __a
3168 /// A 128-bit integer vector containing the source operand.
3169 /// \param __count
3170 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3171 /// to right-shift each value in operand \a __a.
3172 /// \returns A 128-bit integer vector containing the right-shifted values.
3173 static __inline__ __m128i __DEFAULT_FN_ATTRS
3174 _mm_srl_epi64(__m128i __a, __m128i __count)
3175 {
3176  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3177 }
3178 
3179 /// Compares each of the corresponding 8-bit values of the 128-bit
3180 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3181 /// for true.
3182 ///
3183 /// \headerfile <x86intrin.h>
3184 ///
3185 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3186 ///
3187 /// \param __a
3188 /// A 128-bit integer vector.
3189 /// \param __b
3190 /// A 128-bit integer vector.
3191 /// \returns A 128-bit integer vector containing the comparison results.
3192 static __inline__ __m128i __DEFAULT_FN_ATTRS
3193 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
3194 {
3195  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3196 }
3197 
3198 /// Compares each of the corresponding 16-bit values of the 128-bit
3199 /// integer vectors for equality. Each comparison yields 0x0 for false,
3200 /// 0xFFFF for true.
3201 ///
3202 /// \headerfile <x86intrin.h>
3203 ///
3204 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3205 ///
3206 /// \param __a
3207 /// A 128-bit integer vector.
3208 /// \param __b
3209 /// A 128-bit integer vector.
3210 /// \returns A 128-bit integer vector containing the comparison results.
3211 static __inline__ __m128i __DEFAULT_FN_ATTRS
3212 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
3213 {
3214  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3215 }
3216 
3217 /// Compares each of the corresponding 32-bit values of the 128-bit
3218 /// integer vectors for equality. Each comparison yields 0x0 for false,
3219 /// 0xFFFFFFFF for true.
3220 ///
3221 /// \headerfile <x86intrin.h>
3222 ///
3223 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3224 ///
3225 /// \param __a
3226 /// A 128-bit integer vector.
3227 /// \param __b
3228 /// A 128-bit integer vector.
3229 /// \returns A 128-bit integer vector containing the comparison results.
3230 static __inline__ __m128i __DEFAULT_FN_ATTRS
3231 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
3232 {
3233  return (__m128i)((__v4si)__a == (__v4si)__b);
3234 }
3235 
3236 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3237 /// integer vectors to determine if the values in the first operand are
3238 /// greater than those in the second operand. Each comparison yields 0x0 for
3239 /// false, 0xFF for true.
3240 ///
3241 /// \headerfile <x86intrin.h>
3242 ///
3243 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3244 ///
3245 /// \param __a
3246 /// A 128-bit integer vector.
3247 /// \param __b
3248 /// A 128-bit integer vector.
3249 /// \returns A 128-bit integer vector containing the comparison results.
3250 static __inline__ __m128i __DEFAULT_FN_ATTRS
3251 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
3252 {
3253  /* This function always performs a signed comparison, but __v16qi is a char
3254  which may be signed or unsigned, so use __v16qs. */
3255  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3256 }
3257 
3258 /// Compares each of the corresponding signed 16-bit values of the
3259 /// 128-bit integer vectors to determine if the values in the first operand
3260 /// are greater than those in the second operand.
3261 ///
3262 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3263 ///
3264 /// \headerfile <x86intrin.h>
3265 ///
3266 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3267 ///
3268 /// \param __a
3269 /// A 128-bit integer vector.
3270 /// \param __b
3271 /// A 128-bit integer vector.
3272 /// \returns A 128-bit integer vector containing the comparison results.
3273 static __inline__ __m128i __DEFAULT_FN_ATTRS
3274 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
3275 {
3276  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3277 }
3278 
3279 /// Compares each of the corresponding signed 32-bit values of the
3280 /// 128-bit integer vectors to determine if the values in the first operand
3281 /// are greater than those in the second operand.
3282 ///
3283 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3284 ///
3285 /// \headerfile <x86intrin.h>
3286 ///
3287 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3288 ///
3289 /// \param __a
3290 /// A 128-bit integer vector.
3291 /// \param __b
3292 /// A 128-bit integer vector.
3293 /// \returns A 128-bit integer vector containing the comparison results.
3294 static __inline__ __m128i __DEFAULT_FN_ATTRS
3295 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
3296 {
3297  return (__m128i)((__v4si)__a > (__v4si)__b);
3298 }
3299 
3300 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3301 /// integer vectors to determine if the values in the first operand are less
3302 /// than those in the second operand.
3303 ///
3304 /// Each comparison yields 0x0 for false, 0xFF for true.
3305 ///
3306 /// \headerfile <x86intrin.h>
3307 ///
3308 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3309 ///
3310 /// \param __a
3311 /// A 128-bit integer vector.
3312 /// \param __b
3313 /// A 128-bit integer vector.
3314 /// \returns A 128-bit integer vector containing the comparison results.
3315 static __inline__ __m128i __DEFAULT_FN_ATTRS
3316 _mm_cmplt_epi8(__m128i __a, __m128i __b)
3317 {
3318  return _mm_cmpgt_epi8(__b, __a);
3319 }
3320 
3321 /// Compares each of the corresponding signed 16-bit values of the
3322 /// 128-bit integer vectors to determine if the values in the first operand
3323 /// are less than those in the second operand.
3324 ///
3325 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3326 ///
3327 /// \headerfile <x86intrin.h>
3328 ///
3329 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3330 ///
3331 /// \param __a
3332 /// A 128-bit integer vector.
3333 /// \param __b
3334 /// A 128-bit integer vector.
3335 /// \returns A 128-bit integer vector containing the comparison results.
3336 static __inline__ __m128i __DEFAULT_FN_ATTRS
3337 _mm_cmplt_epi16(__m128i __a, __m128i __b)
3338 {
3339  return _mm_cmpgt_epi16(__b, __a);
3340 }
3341 
3342 /// Compares each of the corresponding signed 32-bit values of the
3343 /// 128-bit integer vectors to determine if the values in the first operand
3344 /// are less than those in the second operand.
3345 ///
3346 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3347 ///
3348 /// \headerfile <x86intrin.h>
3349 ///
3350 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3351 ///
3352 /// \param __a
3353 /// A 128-bit integer vector.
3354 /// \param __b
3355 /// A 128-bit integer vector.
3356 /// \returns A 128-bit integer vector containing the comparison results.
3357 static __inline__ __m128i __DEFAULT_FN_ATTRS
3358 _mm_cmplt_epi32(__m128i __a, __m128i __b)
3359 {
3360  return _mm_cmpgt_epi32(__b, __a);
3361 }
3362 
3363 #ifdef __x86_64__
3364 /// Converts a 64-bit signed integer value from the second operand into a
3365 /// double-precision value and returns it in the lower element of a [2 x
3366 /// double] vector; the upper element of the returned vector is copied from
3367 /// the upper element of the first operand.
3368 ///
3369 /// \headerfile <x86intrin.h>
3370 ///
3371 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3372 ///
3373 /// \param __a
3374 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3375 /// copied to the upper 64 bits of the destination.
3376 /// \param __b
3377 /// A 64-bit signed integer operand containing the value to be converted.
3378 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3379 /// converted value of the second operand. The upper 64 bits are copied from
3380 /// the upper 64 bits of the first operand.
3381 static __inline__ __m128d __DEFAULT_FN_ATTRS
3382 _mm_cvtsi64_sd(__m128d __a, long long __b)
3383 {
3384  __a[0] = __b;
3385  return __a;
3386 }
3387 
3388 /// Converts the first (lower) element of a vector of [2 x double] into a
3389 /// 64-bit signed integer value, according to the current rounding mode.
3390 ///
3391 /// \headerfile <x86intrin.h>
3392 ///
3393 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3394 ///
3395 /// \param __a
3396 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3397 /// conversion.
3398 /// \returns A 64-bit signed integer containing the converted value.
3399 static __inline__ long long __DEFAULT_FN_ATTRS
3400 _mm_cvtsd_si64(__m128d __a)
3401 {
3402  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3403 }
3404 
3405 /// Converts the first (lower) element of a vector of [2 x double] into a
3406 /// 64-bit signed integer value, truncating the result when it is inexact.
3407 ///
3408 /// \headerfile <x86intrin.h>
3409 ///
3410 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3411 /// instruction.
3412 ///
3413 /// \param __a
3414 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3415 /// conversion.
3416 /// \returns A 64-bit signed integer containing the converted value.
3417 static __inline__ long long __DEFAULT_FN_ATTRS
3418 _mm_cvttsd_si64(__m128d __a)
3419 {
3420  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3421 }
3422 #endif
3423 
3424 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3425 ///
3426 /// \headerfile <x86intrin.h>
3427 ///
3428 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3429 ///
3430 /// \param __a
3431 /// A 128-bit integer vector.
3432 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3433 static __inline__ __m128 __DEFAULT_FN_ATTRS
3434 _mm_cvtepi32_ps(__m128i __a)
3435 {
3436  return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
3437 }
3438 
3439 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3440 ///
3441 /// \headerfile <x86intrin.h>
3442 ///
3443 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3444 ///
3445 /// \param __a
3446 /// A 128-bit vector of [4 x float].
3447 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3448 /// values.
3449 static __inline__ __m128i __DEFAULT_FN_ATTRS
3450 _mm_cvtps_epi32(__m128 __a)
3451 {
3452  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3453 }
3454 
3455 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3456 /// truncating the result when it is inexact.
3457 ///
3458 /// \headerfile <x86intrin.h>
3459 ///
3460 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3461 /// instruction.
3462 ///
3463 /// \param __a
3464 /// A 128-bit vector of [4 x float].
3465 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3466 static __inline__ __m128i __DEFAULT_FN_ATTRS
3467 _mm_cvttps_epi32(__m128 __a)
3468 {
3469  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3470 }
3471 
3472 /// Returns a vector of [4 x i32] where the lowest element is the input
3473 /// operand and the remaining elements are zero.
3474 ///
3475 /// \headerfile <x86intrin.h>
3476 ///
3477 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3478 ///
3479 /// \param __a
3480 /// A 32-bit signed integer operand.
3481 /// \returns A 128-bit vector of [4 x i32].
3482 static __inline__ __m128i __DEFAULT_FN_ATTRS
3484 {
3485  return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
3486 }
3487 
3488 #ifdef __x86_64__
3489 /// Returns a vector of [2 x i64] where the lower element is the input
3490 /// operand and the upper element is zero.
3491 ///
3492 /// \headerfile <x86intrin.h>
3493 ///
3494 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3495 ///
3496 /// \param __a
3497 /// A 64-bit signed integer operand containing the value to be converted.
3498 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3499 static __inline__ __m128i __DEFAULT_FN_ATTRS
3500 _mm_cvtsi64_si128(long long __a)
3501 {
3502  return __extension__ (__m128i)(__v2di){ __a, 0 };
3503 }
3504 #endif
3505 
3506 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3507 /// 32-bit signed integer value.
3508 ///
3509 /// \headerfile <x86intrin.h>
3510 ///
3511 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3512 ///
3513 /// \param __a
3514 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3515 /// destination.
3516 /// \returns A 32-bit signed integer containing the moved value.
3517 static __inline__ int __DEFAULT_FN_ATTRS
3518 _mm_cvtsi128_si32(__m128i __a)
3519 {
3520  __v4si __b = (__v4si)__a;
3521  return __b[0];
3522 }
3523 
3524 #ifdef __x86_64__
3525 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3526 /// 64-bit signed integer value.
3527 ///
3528 /// \headerfile <x86intrin.h>
3529 ///
3530 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3531 ///
3532 /// \param __a
3533 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3534 /// destination.
3535 /// \returns A 64-bit signed integer containing the moved value.
3536 static __inline__ long long __DEFAULT_FN_ATTRS
3537 _mm_cvtsi128_si64(__m128i __a)
3538 {
3539  return __a[0];
3540 }
3541 #endif
3542 
3543 /// Moves packed integer values from an aligned 128-bit memory location
3544 /// to elements in a 128-bit integer vector.
3545 ///
3546 /// \headerfile <x86intrin.h>
3547 ///
3548 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3549 ///
3550 /// \param __p
3551 /// An aligned pointer to a memory location containing integer values.
3552 /// \returns A 128-bit integer vector containing the moved values.
3553 static __inline__ __m128i __DEFAULT_FN_ATTRS
3554 _mm_load_si128(__m128i const *__p)
3555 {
3556  return *__p;
3557 }
3558 
3559 /// Moves packed integer values from an unaligned 128-bit memory location
3560 /// to elements in a 128-bit integer vector.
3561 ///
3562 /// \headerfile <x86intrin.h>
3563 ///
3564 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3565 ///
3566 /// \param __p
3567 /// A pointer to a memory location containing integer values.
3568 /// \returns A 128-bit integer vector containing the moved values.
3569 static __inline__ __m128i __DEFAULT_FN_ATTRS
3570 _mm_loadu_si128(__m128i_u const *__p)
3571 {
3572  struct __loadu_si128 {
3573  __m128i_u __v;
3574  } __attribute__((__packed__, __may_alias__));
3575  return ((struct __loadu_si128*)__p)->__v;
3576 }
3577 
3578 /// Returns a vector of [2 x i64] where the lower element is taken from
3579 /// the lower element of the operand, and the upper element is zero.
3580 ///
3581 /// \headerfile <x86intrin.h>
3582 ///
3583 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3584 ///
3585 /// \param __p
3586 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3587 /// the destination.
3588 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3589 /// moved value. The higher order bits are cleared.
3590 static __inline__ __m128i __DEFAULT_FN_ATTRS
3591 _mm_loadl_epi64(__m128i_u const *__p)
3592 {
3593  struct __mm_loadl_epi64_struct {
3594  long long __u;
3595  } __attribute__((__packed__, __may_alias__));
3596  return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
3597 }
3598 
3599 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3600 /// This could be used as an argument to another intrinsic function where the
3601 /// argument is required but the value is not actually used.
3602 ///
3603 /// \headerfile <x86intrin.h>
3604 ///
3605 /// This intrinsic has no corresponding instruction.
3606 ///
3607 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS
3610 {
3611  return (__m128i)__builtin_ia32_undef128();
3612 }
3613 
3614 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3615 /// the specified 64-bit integer values.
3616 ///
3617 /// \headerfile <x86intrin.h>
3618 ///
3619 /// This intrinsic is a utility function and does not correspond to a specific
3620 /// instruction.
3621 ///
3622 /// \param __q1
3623 /// A 64-bit integer value used to initialize the upper 64 bits of the
3624 /// destination vector of [2 x i64].
3625 /// \param __q0
3626 /// A 64-bit integer value used to initialize the lower 64 bits of the
3627 /// destination vector of [2 x i64].
3628 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3629 /// provided in the operands.
3630 static __inline__ __m128i __DEFAULT_FN_ATTRS
3631 _mm_set_epi64x(long long __q1, long long __q0)
3632 {
3633  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
3634 }
3635 
3636 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3637 /// the specified 64-bit integer values.
3638 ///
3639 /// \headerfile <x86intrin.h>
3640 ///
3641 /// This intrinsic is a utility function and does not correspond to a specific
3642 /// instruction.
3643 ///
3644 /// \param __q1
3645 /// A 64-bit integer value used to initialize the upper 64 bits of the
3646 /// destination vector of [2 x i64].
3647 /// \param __q0
3648 /// A 64-bit integer value used to initialize the lower 64 bits of the
3649 /// destination vector of [2 x i64].
3650 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3651 /// provided in the operands.
3652 static __inline__ __m128i __DEFAULT_FN_ATTRS
3653 _mm_set_epi64(__m64 __q1, __m64 __q0)
3654 {
3655  return _mm_set_epi64x((long long)__q1, (long long)__q0);
3656 }
3657 
3658 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3659 /// the specified 32-bit integer values.
3660 ///
3661 /// \headerfile <x86intrin.h>
3662 ///
3663 /// This intrinsic is a utility function and does not correspond to a specific
3664 /// instruction.
3665 ///
3666 /// \param __i3
3667 /// A 32-bit integer value used to initialize bits [127:96] of the
3668 /// destination vector.
3669 /// \param __i2
3670 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3671 /// vector.
3672 /// \param __i1
3673 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3674 /// vector.
3675 /// \param __i0
3676 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3677 /// vector.
3678 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3679 /// provided in the operands.
3680 static __inline__ __m128i __DEFAULT_FN_ATTRS
3681 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
3682 {
3683  return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
3684 }
3685 
3686 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3687 /// the specified 16-bit integer values.
3688 ///
3689 /// \headerfile <x86intrin.h>
3690 ///
3691 /// This intrinsic is a utility function and does not correspond to a specific
3692 /// instruction.
3693 ///
3694 /// \param __w7
3695 /// A 16-bit integer value used to initialize bits [127:112] of the
3696 /// destination vector.
3697 /// \param __w6
3698 /// A 16-bit integer value used to initialize bits [111:96] of the
3699 /// destination vector.
3700 /// \param __w5
3701 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3702 /// vector.
3703 /// \param __w4
3704 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3705 /// vector.
3706 /// \param __w3
3707 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3708 /// vector.
3709 /// \param __w2
3710 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3711 /// vector.
3712 /// \param __w1
3713 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3714 /// vector.
3715 /// \param __w0
3716 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3717 /// vector.
3718 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3719 /// provided in the operands.
3720 static __inline__ __m128i __DEFAULT_FN_ATTRS
3721 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
3722 {
3723  return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
3724 }
3725 
3726 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3727 /// the specified 8-bit integer values.
3728 ///
3729 /// \headerfile <x86intrin.h>
3730 ///
3731 /// This intrinsic is a utility function and does not correspond to a specific
3732 /// instruction.
3733 ///
3734 /// \param __b15
3735 /// Initializes bits [127:120] of the destination vector.
3736 /// \param __b14
3737 /// Initializes bits [119:112] of the destination vector.
3738 /// \param __b13
3739 /// Initializes bits [111:104] of the destination vector.
3740 /// \param __b12
3741 /// Initializes bits [103:96] of the destination vector.
3742 /// \param __b11
3743 /// Initializes bits [95:88] of the destination vector.
3744 /// \param __b10
3745 /// Initializes bits [87:80] of the destination vector.
3746 /// \param __b9
3747 /// Initializes bits [79:72] of the destination vector.
3748 /// \param __b8
3749 /// Initializes bits [71:64] of the destination vector.
3750 /// \param __b7
3751 /// Initializes bits [63:56] of the destination vector.
3752 /// \param __b6
3753 /// Initializes bits [55:48] of the destination vector.
3754 /// \param __b5
3755 /// Initializes bits [47:40] of the destination vector.
3756 /// \param __b4
3757 /// Initializes bits [39:32] of the destination vector.
3758 /// \param __b3
3759 /// Initializes bits [31:24] of the destination vector.
3760 /// \param __b2
3761 /// Initializes bits [23:16] of the destination vector.
3762 /// \param __b1
3763 /// Initializes bits [15:8] of the destination vector.
3764 /// \param __b0
3765 /// Initializes bits [7:0] of the destination vector.
3766 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3767 /// provided in the operands.
3768 static __inline__ __m128i __DEFAULT_FN_ATTRS
3769 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
3770 {
3771  return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
3772 }
3773 
3774 /// Initializes both values in a 128-bit integer vector with the
3775 /// specified 64-bit integer value.
3776 ///
3777 /// \headerfile <x86intrin.h>
3778 ///
3779 /// This intrinsic is a utility function and does not correspond to a specific
3780 /// instruction.
3781 ///
3782 /// \param __q
3783 /// Integer value used to initialize the elements of the destination integer
3784 /// vector.
3785 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3786 /// elements containing the value provided in the operand.
3787 static __inline__ __m128i __DEFAULT_FN_ATTRS
3788 _mm_set1_epi64x(long long __q)
3789 {
3790  return _mm_set_epi64x(__q, __q);
3791 }
3792 
3793 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3794 /// specified 64-bit value.
3795 ///
3796 /// \headerfile <x86intrin.h>
3797 ///
3798 /// This intrinsic is a utility function and does not correspond to a specific
3799 /// instruction.
3800 ///
3801 /// \param __q
3802 /// A 64-bit value used to initialize the elements of the destination integer
3803 /// vector.
3804 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3805 /// containing the value provided in the operand.
3806 static __inline__ __m128i __DEFAULT_FN_ATTRS
3807 _mm_set1_epi64(__m64 __q)
3808 {
3809  return _mm_set_epi64(__q, __q);
3810 }
3811 
3812 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3813 /// specified 32-bit value.
3814 ///
3815 /// \headerfile <x86intrin.h>
3816 ///
3817 /// This intrinsic is a utility function and does not correspond to a specific
3818 /// instruction.
3819 ///
3820 /// \param __i
3821 /// A 32-bit value used to initialize the elements of the destination integer
3822 /// vector.
3823 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3824 /// containing the value provided in the operand.
3825 static __inline__ __m128i __DEFAULT_FN_ATTRS
3827 {
3828  return _mm_set_epi32(__i, __i, __i, __i);
3829 }
3830 
3831 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3832 /// specified 16-bit value.
3833 ///
3834 /// \headerfile <x86intrin.h>
3835 ///
3836 /// This intrinsic is a utility function and does not correspond to a specific
3837 /// instruction.
3838 ///
3839 /// \param __w
3840 /// A 16-bit value used to initialize the elements of the destination integer
3841 /// vector.
3842 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3843 /// containing the value provided in the operand.
3844 static __inline__ __m128i __DEFAULT_FN_ATTRS
3845 _mm_set1_epi16(short __w)
3846 {
3847  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3848 }
3849 
3850 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3851 /// specified 8-bit value.
3852 ///
3853 /// \headerfile <x86intrin.h>
3854 ///
3855 /// This intrinsic is a utility function and does not correspond to a specific
3856 /// instruction.
3857 ///
3858 /// \param __b
3859 /// An 8-bit value used to initialize the elements of the destination integer
3860 /// vector.
3861 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3862 /// containing the value provided in the operand.
3863 static __inline__ __m128i __DEFAULT_FN_ATTRS
3864 _mm_set1_epi8(char __b)
3865 {
3866  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
3867 }
3868 
3869 /// Constructs a 128-bit integer vector, initialized in reverse order
3870 /// with the specified 64-bit integral values.
3871 ///
3872 /// \headerfile <x86intrin.h>
3873 ///
3874 /// This intrinsic does not correspond to a specific instruction.
3875 ///
3876 /// \param __q0
3877 /// A 64-bit integral value used to initialize the lower 64 bits of the
3878 /// result.
3879 /// \param __q1
3880 /// A 64-bit integral value used to initialize the upper 64 bits of the
3881 /// result.
3882 /// \returns An initialized 128-bit integer vector.
3883 static __inline__ __m128i __DEFAULT_FN_ATTRS
3884 _mm_setr_epi64(__m64 __q0, __m64 __q1)
3885 {
3886  return _mm_set_epi64(__q1, __q0);
3887 }
3888 
3889 /// Constructs a 128-bit integer vector, initialized in reverse order
3890 /// with the specified 32-bit integral values.
3891 ///
3892 /// \headerfile <x86intrin.h>
3893 ///
3894 /// This intrinsic is a utility function and does not correspond to a specific
3895 /// instruction.
3896 ///
3897 /// \param __i0
3898 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3899 /// \param __i1
3900 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3901 /// \param __i2
3902 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3903 /// \param __i3
3904 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3905 /// \returns An initialized 128-bit integer vector.
3906 static __inline__ __m128i __DEFAULT_FN_ATTRS
3907 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
3908 {
3909  return _mm_set_epi32(__i3, __i2, __i1, __i0);
3910 }
3911 
3912 /// Constructs a 128-bit integer vector, initialized in reverse order
3913 /// with the specified 16-bit integral values.
3914 ///
3915 /// \headerfile <x86intrin.h>
3916 ///
3917 /// This intrinsic is a utility function and does not correspond to a specific
3918 /// instruction.
3919 ///
3920 /// \param __w0
3921 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3922 /// \param __w1
3923 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3924 /// \param __w2
3925 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3926 /// \param __w3
3927 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3928 /// \param __w4
3929 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3930 /// \param __w5
3931 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3932 /// \param __w6
3933 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3934 /// \param __w7
3935 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3936 /// \returns An initialized 128-bit integer vector.
3937 static __inline__ __m128i __DEFAULT_FN_ATTRS
3938 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
3939 {
3940  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3941 }
3942 
3943 /// Constructs a 128-bit integer vector, initialized in reverse order
3944 /// with the specified 8-bit integral values.
3945 ///
3946 /// \headerfile <x86intrin.h>
3947 ///
3948 /// This intrinsic is a utility function and does not correspond to a specific
3949 /// instruction.
3950 ///
3951 /// \param __b0
3952 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3953 /// \param __b1
3954 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3955 /// \param __b2
3956 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3957 /// \param __b3
3958 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3959 /// \param __b4
3960 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3961 /// \param __b5
3962 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3963 /// \param __b6
3964 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3965 /// \param __b7
3966 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3967 /// \param __b8
3968 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3969 /// \param __b9
3970 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3971 /// \param __b10
3972 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3973 /// \param __b11
3974 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3975 /// \param __b12
3976 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3977 /// \param __b13
3978 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3979 /// \param __b14
3980 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3981 /// \param __b15
3982 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3983 /// \returns An initialized 128-bit integer vector.
3984 static __inline__ __m128i __DEFAULT_FN_ATTRS
3985 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
3986 {
3987  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3988 }
3989 
3990 /// Creates a 128-bit integer vector initialized to zero.
3991 ///
3992 /// \headerfile <x86intrin.h>
3993 ///
3994 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3995 ///
3996 /// \returns An initialized 128-bit integer vector with all elements set to
3997 /// zero.
3998 static __inline__ __m128i __DEFAULT_FN_ATTRS
4000 {
4001  return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
4002 }
4003 
4004 /// Stores a 128-bit integer vector to a memory location aligned on a
4005 /// 128-bit boundary.
4006 ///
4007 /// \headerfile <x86intrin.h>
4008 ///
4009 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
4010 ///
4011 /// \param __p
4012 /// A pointer to an aligned memory location that will receive the integer
4013 /// values.
4014 /// \param __b
4015 /// A 128-bit integer vector containing the values to be moved.
4016 static __inline__ void __DEFAULT_FN_ATTRS
4017 _mm_store_si128(__m128i *__p, __m128i __b)
4018 {
4019  *__p = __b;
4020 }
4021 
4022 /// Stores a 128-bit integer vector to an unaligned memory location.
4023 ///
4024 /// \headerfile <x86intrin.h>
4025 ///
4026 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
4027 ///
4028 /// \param __p
4029 /// A pointer to a memory location that will receive the integer values.
4030 /// \param __b
4031 /// A 128-bit integer vector containing the values to be moved.
4032 static __inline__ void __DEFAULT_FN_ATTRS
4033 _mm_storeu_si128(__m128i_u *__p, __m128i __b)
4034 {
4035  struct __storeu_si128 {
4036  __m128i_u __v;
4037  } __attribute__((__packed__, __may_alias__));
4038  ((struct __storeu_si128*)__p)->__v = __b;
4039 }
4040 
4041 /// Stores a 64-bit integer value from the low element of a 128-bit integer
4042 /// vector.
4043 ///
4044 /// \headerfile <x86intrin.h>
4045 ///
4046 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4047 ///
4048 /// \param __p
4049 /// A pointer to a 64-bit memory location. The address of the memory
4050 /// location does not have to be algned.
4051 /// \param __b
4052 /// A 128-bit integer vector containing the value to be stored.
4053 static __inline__ void __DEFAULT_FN_ATTRS
4054 _mm_storeu_si64(void const *__p, __m128i __b)
4055 {
4056  struct __storeu_si64 {
4057  long long __v;
4058  } __attribute__((__packed__, __may_alias__));
4059  ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
4060 }
4061 
4062 /// Stores a 32-bit integer value from the low element of a 128-bit integer
4063 /// vector.
4064 ///
4065 /// \headerfile <x86intrin.h>
4066 ///
4067 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
4068 ///
4069 /// \param __p
4070 /// A pointer to a 32-bit memory location. The address of the memory
4071 /// location does not have to be aligned.
4072 /// \param __b
4073 /// A 128-bit integer vector containing the value to be stored.
4074 static __inline__ void __DEFAULT_FN_ATTRS
4075 _mm_storeu_si32(void const *__p, __m128i __b)
4076 {
4077  struct __storeu_si32 {
4078  int __v;
4079  } __attribute__((__packed__, __may_alias__));
4080  ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
4081 }
4082 
4083 /// Stores a 16-bit integer value from the low element of a 128-bit integer
4084 /// vector.
4085 ///
4086 /// \headerfile <x86intrin.h>
4087 ///
4088 /// This intrinsic does not correspond to a specific instruction.
4089 ///
4090 /// \param __p
4091 /// A pointer to a 16-bit memory location. The address of the memory
4092 /// location does not have to be aligned.
4093 /// \param __b
4094 /// A 128-bit integer vector containing the value to be stored.
4095 static __inline__ void __DEFAULT_FN_ATTRS
4096 _mm_storeu_si16(void const *__p, __m128i __b)
4097 {
4098  struct __storeu_si16 {
4099  short __v;
4100  } __attribute__((__packed__, __may_alias__));
4101  ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
4102 }
4103 
4104 /// Moves bytes selected by the mask from the first operand to the
4105 /// specified unaligned memory location. When a mask bit is 1, the
4106 /// corresponding byte is written, otherwise it is not written.
4107 ///
4108 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4109 /// used again soon). Exception and trap behavior for elements not selected
4110 /// for storage to memory are implementation dependent.
4111 ///
4112 /// \headerfile <x86intrin.h>
4113 ///
4114 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
4115 /// instruction.
4116 ///
4117 /// \param __d
4118 /// A 128-bit integer vector containing the values to be moved.
4119 /// \param __n
4120 /// A 128-bit integer vector containing the mask. The most significant bit of
4121 /// each byte represents the mask bits.
4122 /// \param __p
4123 /// A pointer to an unaligned 128-bit memory location where the specified
4124 /// values are moved.
4125 static __inline__ void __DEFAULT_FN_ATTRS
4126 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
4127 {
4128  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4129 }
4130 
4131 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4132 /// a memory location.
4133 ///
4134 /// \headerfile <x86intrin.h>
4135 ///
4136 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4137 ///
4138 /// \param __p
4139 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
4140 /// of the integer vector parameter.
4141 /// \param __a
4142 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4143 /// value to be stored.
4144 static __inline__ void __DEFAULT_FN_ATTRS
4145 _mm_storel_epi64(__m128i_u *__p, __m128i __a)
4146 {
4147  struct __mm_storel_epi64_struct {
4148  long long __u;
4149  } __attribute__((__packed__, __may_alias__));
4150  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
4151 }
4152 
4153 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4154 /// aligned memory location.
4155 ///
4156 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4157 /// used again soon).
4158 ///
4159 /// \headerfile <x86intrin.h>
4160 ///
4161 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4162 ///
4163 /// \param __p
4164 /// A pointer to the 128-bit aligned memory location used to store the value.
4165 /// \param __a
4166 /// A vector of [2 x double] containing the 64-bit values to be stored.
4167 static __inline__ void __DEFAULT_FN_ATTRS
4168 _mm_stream_pd(double *__p, __m128d __a)
4169 {
4170  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
4171 }
4172 
4173 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4174 ///
4175 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4176 /// used again soon).
4177 ///
4178 /// \headerfile <x86intrin.h>
4179 ///
4180 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4181 ///
4182 /// \param __p
4183 /// A pointer to the 128-bit aligned memory location used to store the value.
4184 /// \param __a
4185 /// A 128-bit integer vector containing the values to be stored.
4186 static __inline__ void __DEFAULT_FN_ATTRS
4187 _mm_stream_si128(__m128i *__p, __m128i __a)
4188 {
4189  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
4190 }
4191 
/// Stores a 32-bit integer value in the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location used to store the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}
4210 
#if defined(__x86_64__)
/// Writes a 64-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location that receives the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si64(long long *__p, long long __a)
{
  __builtin_ia32_movnti64(__p, __a);
}
#endif /* defined(__x86_64__) */
4231 
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(void const * __p);

/// Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that all previous loads complete before any
///    subsequent load executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and
///    store instructions that follow it, so that all previous memory accesses
///    complete before any subsequent memory access executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4273 
4274 /// Converts 16-bit signed integers from both 128-bit integer vector
4275 /// operands into 8-bit signed integers, and packs the results into the
4276 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4277 /// Negative values less than 0x80 are saturated to 0x80.
4278 ///
4279 /// \headerfile <x86intrin.h>
4280 ///
4281 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4282 ///
4283 /// \param __a
4284 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4285 /// a signed integer and is converted to a 8-bit signed integer with
4286 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4287 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4288 /// written to the lower 64 bits of the result.
4289 /// \param __b
4290 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4291 /// a signed integer and is converted to a 8-bit signed integer with
4292 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4293 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4294 /// written to the higher 64 bits of the result.
4295 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4296 static __inline__ __m128i __DEFAULT_FN_ATTRS
4297 _mm_packs_epi16(__m128i __a, __m128i __b)
4298 {
4299  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4300 }
4301 
4302 /// Converts 32-bit signed integers from both 128-bit integer vector
4303 /// operands into 16-bit signed integers, and packs the results into the
4304 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4305 /// Negative values less than 0x8000 are saturated to 0x8000.
4306 ///
4307 /// \headerfile <x86intrin.h>
4308 ///
4309 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4310 ///
4311 /// \param __a
4312 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4313 /// a signed integer and is converted to a 16-bit signed integer with
4314 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4315 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4316 /// are written to the lower 64 bits of the result.
4317 /// \param __b
4318 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4319 /// a signed integer and is converted to a 16-bit signed integer with
4320 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4321 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4322 /// are written to the higher 64 bits of the result.
4323 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4324 static __inline__ __m128i __DEFAULT_FN_ATTRS
4325 _mm_packs_epi32(__m128i __a, __m128i __b)
4326 {
4327  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4328 }
4329 
4330 /// Converts 16-bit signed integers from both 128-bit integer vector
4331 /// operands into 8-bit unsigned integers, and packs the results into the
4332 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4333 /// than 0x00 are saturated to 0x00.
4334 ///
4335 /// \headerfile <x86intrin.h>
4336 ///
4337 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4338 ///
4339 /// \param __a
4340 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4341 /// a signed integer and is converted to an 8-bit unsigned integer with
4342 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4343 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4344 /// written to the lower 64 bits of the result.
4345 /// \param __b
4346 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4347 /// a signed integer and is converted to an 8-bit unsigned integer with
4348 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4349 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4350 /// written to the higher 64 bits of the result.
4351 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4352 static __inline__ __m128i __DEFAULT_FN_ATTRS
4353 _mm_packus_epi16(__m128i __a, __m128i __b)
4354 {
4355  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4356 }
4357 
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \param __imm
///    An immediate value. Bits [2:0] select the value from \a __a that is
///    assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a __a. \n
///    001: assign values from bits [31:16] of \a __a. \n
///    010: assign values from bits [47:32] of \a __a. \n
///    011: assign values from bits [63:48] of \a __a. \n
///    100: assign values from bits [79:64] of \a __a. \n
///    101: assign values from bits [95:80] of \a __a. \n
///    110: assign values from bits [111:96] of \a __a. \n
///    111: assign values from bits [127:112] of \a __a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
/* The whole expansion is parenthesized so that the leading casts cannot bind
   to neighbouring operators at the point of use (for example sizeof). */
#define _mm_extract_epi16(a, imm) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                    (int)(imm)))
4383 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into an offset specified by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param __a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a __b.
/// \param __b
///    An integer. The lower 16 bits of this parameter are written to the
///    result beginning at an offset specified by \a __imm.
/// \param __imm
///    An immediate value specifying the bit offset in the result at which the
///    lower 16 bits of \a __b are written.
/// \returns A 128-bit integer vector containing the constructed values.
/* The whole expansion is parenthesized so that the leading cast cannot bind
   to neighbouring operators at the point of use (for example sizeof or []). */
#define _mm_insert_epi16(a, b, imm) \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                        (int)(imm)))
4407 
4408 /// Copies the values of the most significant bits from each 8-bit
4409 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4410 /// value, zero-extends the value, and writes it to the destination.
4411 ///
4412 /// \headerfile <x86intrin.h>
4413 ///
4414 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4415 ///
4416 /// \param __a
4417 /// A 128-bit integer vector containing the values with bits to be extracted.
4418 /// \returns The most significant bits from each 8-bit element in \a __a,
4419 /// written to bits [15:0]. The other bits are assigned zeros.
4420 static __inline__ int __DEFAULT_FN_ATTRS
4421 _mm_movemask_epi8(__m128i __a)
4422 {
4423  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4424 }
4425 
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that the leading cast cannot bind
   to neighbouring operators at the point of use (for example sizeof or []). */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4456 
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that the leading cast cannot bind
   to neighbouring operators at the point of use (for example sizeof or []). */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4486 
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that the leading cast cannot bind
   to neighbouring operators at the point of use (for example sizeof or []). */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4516 
4517 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4518 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4519 ///
4520 /// \headerfile <x86intrin.h>
4521 ///
4522 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4523 /// instruction.
4524 ///
4525 /// \param __a
4526 /// A 128-bit vector of [16 x i8].
4527 /// Bits [71:64] are written to bits [7:0] of the result. \n
4528 /// Bits [79:72] are written to bits [23:16] of the result. \n
4529 /// Bits [87:80] are written to bits [39:32] of the result. \n
4530 /// Bits [95:88] are written to bits [55:48] of the result. \n
4531 /// Bits [103:96] are written to bits [71:64] of the result. \n
4532 /// Bits [111:104] are written to bits [87:80] of the result. \n
4533 /// Bits [119:112] are written to bits [103:96] of the result. \n
4534 /// Bits [127:120] are written to bits [119:112] of the result.
4535 /// \param __b
4536 /// A 128-bit vector of [16 x i8]. \n
4537 /// Bits [71:64] are written to bits [15:8] of the result. \n
4538 /// Bits [79:72] are written to bits [31:24] of the result. \n
4539 /// Bits [87:80] are written to bits [47:40] of the result. \n
4540 /// Bits [95:88] are written to bits [63:56] of the result. \n
4541 /// Bits [103:96] are written to bits [79:72] of the result. \n
4542 /// Bits [111:104] are written to bits [95:88] of the result. \n
4543 /// Bits [119:112] are written to bits [111:104] of the result. \n
4544 /// Bits [127:120] are written to bits [127:120] of the result.
4545 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4546 static __inline__ __m128i __DEFAULT_FN_ATTRS
4547 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
4548 {
4549  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
4550 }
4551 
4552 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4553 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4554 ///
4555 /// \headerfile <x86intrin.h>
4556 ///
4557 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4558 /// instruction.
4559 ///
4560 /// \param __a
4561 /// A 128-bit vector of [8 x i16].
4562 /// Bits [79:64] are written to bits [15:0] of the result. \n
4563 /// Bits [95:80] are written to bits [47:32] of the result. \n
4564 /// Bits [111:96] are written to bits [79:64] of the result. \n
4565 /// Bits [127:112] are written to bits [111:96] of the result.
4566 /// \param __b
4567 /// A 128-bit vector of [8 x i16].
4568 /// Bits [79:64] are written to bits [31:16] of the result. \n
4569 /// Bits [95:80] are written to bits [63:48] of the result. \n
4570 /// Bits [111:96] are written to bits [95:80] of the result. \n
4571 /// Bits [127:112] are written to bits [127:112] of the result.
4572 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4573 static __inline__ __m128i __DEFAULT_FN_ATTRS
4574 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
4575 {
4576  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
4577 }
4578 
4579 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4580 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4581 ///
4582 /// \headerfile <x86intrin.h>
4583 ///
4584 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4585 /// instruction.
4586 ///
4587 /// \param __a
4588 /// A 128-bit vector of [4 x i32]. \n
4589 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4590 /// Bits [127:96] are written to bits [95:64] of the destination.
4591 /// \param __b
4592 /// A 128-bit vector of [4 x i32]. \n
4593 /// Bits [95:64] are written to bits [64:32] of the destination. \n
4594 /// Bits [127:96] are written to bits [127:96] of the destination.
4595 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4596 static __inline__ __m128i __DEFAULT_FN_ATTRS
4597 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
4598 {
4599  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
4600 }
4601 
4602 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4603 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4604 ///
4605 /// \headerfile <x86intrin.h>
4606 ///
4607 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4608 /// instruction.
4609 ///
4610 /// \param __a
4611 /// A 128-bit vector of [2 x i64]. \n
4612 /// Bits [127:64] are written to bits [63:0] of the destination.
4613 /// \param __b
4614 /// A 128-bit vector of [2 x i64]. \n
4615 /// Bits [127:64] are written to bits [127:64] of the destination.
4616 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4617 static __inline__ __m128i __DEFAULT_FN_ATTRS
4618 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
4619 {
4620  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
4621 }
4622 
4623 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4624 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4625 ///
4626 /// \headerfile <x86intrin.h>
4627 ///
4628 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4629 /// instruction.
4630 ///
4631 /// \param __a
4632 /// A 128-bit vector of [16 x i8]. \n
4633 /// Bits [7:0] are written to bits [7:0] of the result. \n
4634 /// Bits [15:8] are written to bits [23:16] of the result. \n
4635 /// Bits [23:16] are written to bits [39:32] of the result. \n
4636 /// Bits [31:24] are written to bits [55:48] of the result. \n
4637 /// Bits [39:32] are written to bits [71:64] of the result. \n
4638 /// Bits [47:40] are written to bits [87:80] of the result. \n
4639 /// Bits [55:48] are written to bits [103:96] of the result. \n
4640 /// Bits [63:56] are written to bits [119:112] of the result.
4641 /// \param __b
4642 /// A 128-bit vector of [16 x i8].
4643 /// Bits [7:0] are written to bits [15:8] of the result. \n
4644 /// Bits [15:8] are written to bits [31:24] of the result. \n
4645 /// Bits [23:16] are written to bits [47:40] of the result. \n
4646 /// Bits [31:24] are written to bits [63:56] of the result. \n
4647 /// Bits [39:32] are written to bits [79:72] of the result. \n
4648 /// Bits [47:40] are written to bits [95:88] of the result. \n
4649 /// Bits [55:48] are written to bits [111:104] of the result. \n
4650 /// Bits [63:56] are written to bits [127:120] of the result.
4651 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4652 static __inline__ __m128i __DEFAULT_FN_ATTRS
4653 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
4654 {
4655  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
4656 }
4657 
4658 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4659 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4660 /// [8 x i16].
4661 ///
4662 /// \headerfile <x86intrin.h>
4663 ///
4664 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4665 /// instruction.
4666 ///
4667 /// \param __a
4668 /// A 128-bit vector of [8 x i16].
4669 /// Bits [15:0] are written to bits [15:0] of the result. \n
4670 /// Bits [31:16] are written to bits [47:32] of the result. \n
4671 /// Bits [47:32] are written to bits [79:64] of the result. \n
4672 /// Bits [63:48] are written to bits [111:96] of the result.
4673 /// \param __b
4674 /// A 128-bit vector of [8 x i16].
4675 /// Bits [15:0] are written to bits [31:16] of the result. \n
4676 /// Bits [31:16] are written to bits [63:48] of the result. \n
4677 /// Bits [47:32] are written to bits [95:80] of the result. \n
4678 /// Bits [63:48] are written to bits [127:112] of the result.
4679 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4680 static __inline__ __m128i __DEFAULT_FN_ATTRS
4681 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
4682 {
4683  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
4684 }
4685 
4686 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4687 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4688 ///
4689 /// \headerfile <x86intrin.h>
4690 ///
4691 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4692 /// instruction.
4693 ///
4694 /// \param __a
4695 /// A 128-bit vector of [4 x i32]. \n
4696 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4697 /// Bits [63:32] are written to bits [95:64] of the destination.
4698 /// \param __b
4699 /// A 128-bit vector of [4 x i32]. \n
4700 /// Bits [31:0] are written to bits [64:32] of the destination. \n
4701 /// Bits [63:32] are written to bits [127:96] of the destination.
4702 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4703 static __inline__ __m128i __DEFAULT_FN_ATTRS
4704 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
4705 {
4706  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
4707 }
4708 
4709 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4710 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4711 ///
4712 /// \headerfile <x86intrin.h>
4713 ///
4714 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4715 /// instruction.
4716 ///
4717 /// \param __a
4718 /// A 128-bit vector of [2 x i64]. \n
4719 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4720 /// \param __b
4721 /// A 128-bit vector of [2 x i64]. \n
4722 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4723 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4724 static __inline__ __m128i __DEFAULT_FN_ATTRS
4725 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
4726 {
4727  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
4728 }
4729 
4730 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4731 /// integer.
4732 ///
4733 /// \headerfile <x86intrin.h>
4734 ///
4735 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4736 ///
4737 /// \param __a
4738 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4739 /// destination.
4740 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4741 static __inline__ __m64 __DEFAULT_FN_ATTRS
4742 _mm_movepi64_pi64(__m128i __a)
4743 {
4744  return (__m64)__a[0];
4745 }
4746 
4747 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4748 /// upper bits.
4749 ///
4750 /// \headerfile <x86intrin.h>
4751 ///
4752 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4753 ///
4754 /// \param __a
4755 /// A 64-bit value.
4756 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4757 /// the operand. The upper 64 bits are assigned zeros.
4758 static __inline__ __m128i __DEFAULT_FN_ATTRS
4760 {
4761  return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
4762 }
4763 
4764 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4765 /// integer vector, zeroing the upper bits.
4766 ///
4767 /// \headerfile <x86intrin.h>
4768 ///
4769 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4770 ///
4771 /// \param __a
4772 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4773 /// destination.
4774 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4775 /// the operand. The upper 64 bits are assigned zeros.
4776 static __inline__ __m128i __DEFAULT_FN_ATTRS
4777 _mm_move_epi64(__m128i __a)
4778 {
4779  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4780 }
4781 
4782 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4783 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4784 /// double].
4785 ///
4786 /// \headerfile <x86intrin.h>
4787 ///
4788 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4789 ///
4790 /// \param __a
4791 /// A 128-bit vector of [2 x double]. \n
4792 /// Bits [127:64] are written to bits [63:0] of the destination.
4793 /// \param __b
4794 /// A 128-bit vector of [2 x double]. \n
4795 /// Bits [127:64] are written to bits [127:64] of the destination.
4796 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4797 static __inline__ __m128d __DEFAULT_FN_ATTRS
4798 _mm_unpackhi_pd(__m128d __a, __m128d __b)
4799 {
4800  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
4801 }
4802 
4803 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4804 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4805 /// double].
4806 ///
4807 /// \headerfile <x86intrin.h>
4808 ///
4809 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4810 ///
4811 /// \param __a
4812 /// A 128-bit vector of [2 x double]. \n
4813 /// Bits [63:0] are written to bits [63:0] of the destination.
4814 /// \param __b
4815 /// A 128-bit vector of [2 x double]. \n
4816 /// Bits [63:0] are written to bits [127:64] of the destination.
4817 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4818 static __inline__ __m128d __DEFAULT_FN_ATTRS
4819 _mm_unpacklo_pd(__m128d __a, __m128d __b)
4820 {
4821  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
4822 }
4823 
4824 /// Extracts the sign bits of the double-precision values in the 128-bit
4825 /// vector of [2 x double], zero-extends the value, and writes it to the
4826 /// low-order bits of the destination.
4827 ///
4828 /// \headerfile <x86intrin.h>
4829 ///
4830 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4831 ///
4832 /// \param __a
4833 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4834 /// be extracted.
4835 /// \returns The sign bits from each of the double-precision elements in \a __a,
4836 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4837 static __inline__ int __DEFAULT_FN_ATTRS
4838 _mm_movemask_pd(__m128d __a)
4839 {
4840  return __builtin_ia32_movmskpd((__v2df)__a);
4841 }
4842 
4843 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to upper element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
/* The whole expansion is parenthesized so that a postfix operator written
   after the macro call (for example a subscript) applies to the __m128d
   result rather than binding to the builtin call ahead of the cast. */
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), \
                                  (__v2df)(__m128d)(b), (int)(i)))
4871 
4872 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4873 /// floating-point vector of [4 x float].
4874 ///
4875 /// \headerfile <x86intrin.h>
4876 ///
4877 /// This intrinsic has no corresponding instruction.
4878 ///
4879 /// \param __a
4880 /// A 128-bit floating-point vector of [2 x double].
4881 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4882 /// bitwise pattern as the parameter.
4883 static __inline__ __m128 __DEFAULT_FN_ATTRS
4884 _mm_castpd_ps(__m128d __a)
4885 {
4886  return (__m128)__a;
4887 }
4888 
4889 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4890 /// integer vector.
4891 ///
4892 /// \headerfile <x86intrin.h>
4893 ///
4894 /// This intrinsic has no corresponding instruction.
4895 ///
4896 /// \param __a
4897 /// A 128-bit floating-point vector of [2 x double].
4898 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4899 /// parameter.
4900 static __inline__ __m128i __DEFAULT_FN_ATTRS
4901 _mm_castpd_si128(__m128d __a)
4902 {
4903  return (__m128i)__a;
4904 }
4905 
4906 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4907 /// floating-point vector of [2 x double].
4908 ///
4909 /// \headerfile <x86intrin.h>
4910 ///
4911 /// This intrinsic has no corresponding instruction.
4912 ///
4913 /// \param __a
4914 /// A 128-bit floating-point vector of [4 x float].
4915 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4916 /// bitwise pattern as the parameter.
4917 static __inline__ __m128d __DEFAULT_FN_ATTRS
4918 _mm_castps_pd(__m128 __a)
4919 {
4920  return (__m128d)__a;
4921 }
4922 
4923 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4924 /// integer vector.
4925 ///
4926 /// \headerfile <x86intrin.h>
4927 ///
4928 /// This intrinsic has no corresponding instruction.
4929 ///
4930 /// \param __a
4931 /// A 128-bit floating-point vector of [4 x float].
4932 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4933 /// parameter.
4934 static __inline__ __m128i __DEFAULT_FN_ATTRS
4935 _mm_castps_si128(__m128 __a)
4936 {
4937  return (__m128i)__a;
4938 }
4939 
4940 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4941 /// of [4 x float].
4942 ///
4943 /// \headerfile <x86intrin.h>
4944 ///
4945 /// This intrinsic has no corresponding instruction.
4946 ///
4947 /// \param __a
4948 /// A 128-bit integer vector.
4949 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4950 /// bitwise pattern as the parameter.
4951 static __inline__ __m128 __DEFAULT_FN_ATTRS
4952 _mm_castsi128_ps(__m128i __a)
4953 {
4954  return (__m128)__a;
4955 }
4956 
4957 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4958 /// of [2 x double].
4959 ///
4960 /// \headerfile <x86intrin.h>
4961 ///
4962 /// This intrinsic has no corresponding instruction.
4963 ///
4964 /// \param __a
4965 /// A 128-bit integer vector.
4966 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4967 /// bitwise pattern as the parameter.
4968 static __inline__ __m128d __DEFAULT_FN_ATTRS
4969 _mm_castsi128_pd(__m128i __a)
4970 {
4971  return (__m128d)__a;
4972 }
4973 
/* When compiled as C++, wrap the prototype below in an extern "C" block so
   that it is declared with C linkage. */
4974 #if defined(__cplusplus)
4975 extern "C" {
4976 #endif
4977 
4978 /// Indicates that a spin loop is being executed for the purposes of
4979 /// optimizing power consumption during the loop.
4980 ///
4981 /// \headerfile <x86intrin.h>
4982 ///
4983 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4984 ///
/// Takes no arguments and returns no value. Only a prototype is given at this
/// point; no function body accompanies it here.
4985 void _mm_pause(void);
4986 
/* Close the extern "C" block opened above (C++ only). */
4987 #if defined(__cplusplus)
4988 } // extern "C"
4989 #endif
/* Remove the function-attribute helper macros; they are not defined past this
   point. */
4990 #undef __DEFAULT_FN_ATTRS
4991 #undef __DEFAULT_FN_ATTRS_MMX
4992 
/* Builds an immediate as ((x) << 1) | (y): when x and y are each 0 or 1, x
   becomes bit 1 and y becomes bit 0 of the result.
   NOTE(review): presumably intended as the selector argument of
   _mm_shuffle_pd, whose two low bits choose the elements - confirm. */
4993 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4994 
/* Values for the denormals-are-zero setting: ON is the single bit 0x0040,
   OFF is zero. */
4995 #define _MM_DENORMALS_ZERO_ON (0x0040)
4996 #define _MM_DENORMALS_ZERO_OFF (0x0000)
4997 
/* Mask that isolates the denormals-are-zero bit within the value handled by
   _mm_getcsr() / _mm_setcsr(). */
4998 #define _MM_DENORMALS_ZERO_MASK (0x0040)
4999 
/* GET yields the masked bit of _mm_getcsr(), i.e. one of the ON/OFF values
   above. SET clears the masked bit in the value from _mm_getcsr(), ORs in x,
   and passes the result to _mm_setcsr(); x is not masked, so pass
   _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF. */
5000 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
5001 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
5002 
5003 #endif /* __EMMINTRIN_H */
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:502
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2900
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3079
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1389
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void const *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:4075
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1264
struct __storeu_i16 *__P __v
Definition: immintrin.h:318
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3721
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of each of the two values stored in a 128-bit vector of [2 x double]...
Definition: emmintrin.h:258
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3985
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2919
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2977
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1212
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1487
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1352
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value...
Definition: emmintrin.h:3807
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3117
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3907
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4325
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value...
Definition: emmintrin.h:3788
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1833
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1058
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2106
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4935
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1851
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2175
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2862
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:773
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1110
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1675
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2665
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:326
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3155
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1238
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4819
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1309
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3570
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2592
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1372
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1717
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2533
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1792
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2747
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4597
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3212
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3938
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:654
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location...
Definition: emmintrin.h:4126
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3337
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3653
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4952
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1290
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:282
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:70
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3864
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:799
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4759
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4742
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1559
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1655
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2957
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1611
static __inline__ void int __a
Definition: emmintrin.h:4207
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4884
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3358
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:698
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3631
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:38
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors...
Definition: emmintrin.h:2258
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location...
Definition: emmintrin.h:1986
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1186
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers...
Definition: emmintrin.h:4297
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:3017
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2628
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1542
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit vector of [2 x double] and clears the u...
Definition: emmintrin.h:1738
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3295
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1910
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3483
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:675
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3554
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:2068
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1925
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:131
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:29
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4918
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:195
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1414
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1329
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2215
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1869
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:723
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:481
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2647
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3467
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1889
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2938
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1160
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2434
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1593
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2997
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:955
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:89
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:827
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void const *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:4054
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:981
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2131
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors...
Definition: emmintrin.h:2236
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3231
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4777
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:403
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1964
static __inline__ vector float vector float __b
Definition: altivec.h:534
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:930
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3251
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:612
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:215
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4725
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:4033
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:240
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3681
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3274
static __inline unsigned char unsigned int __x
Definition: adxintrin.h:36
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2153
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4798
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand...
Definition: emmintrin.h:3591
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4187
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2454
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:2045
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1576
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:153
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:3037
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:112
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:364
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4901
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1136
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2514
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value...
Definition: emmintrin.h:1505
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:4145
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2765
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value...
Definition: emmintrin.h:3518
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3769
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3136
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4704
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:905
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4969
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:302
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2394
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2414
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:546
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:421
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3884
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors...
Definition: emmintrin.h:2300
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4653
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2086
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1437
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3999
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2552
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:385
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4618
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:748
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2374
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value...
Definition: emmintrin.h:3826
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1522
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4681
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2707
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1006
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2474
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4353
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
Definition: emmintrin.h:2727
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:4017
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3316
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3609
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors...
Definition: emmintrin.h:2574
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1084
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2494
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1696
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1032
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4547
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3174
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location...
Definition: emmintrin.h:4168
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value...
Definition: emmintrin.h:3845
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1946
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:855
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:523
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:880
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors...
Definition: emmintrin.h:2344
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:460
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3098
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2881
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1765
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:591
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:51
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3450
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2610
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:440
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:346
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float]...
Definition: emmintrin.h:1463
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2820
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double]...
Definition: emmintrin.h:4838
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2686
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum...
Definition: emmintrin.h:2193
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2802
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void const *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:4096
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4421
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:172
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1637
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:2006
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3434
#define __DEFAULT_FN_ATTRS_MMX
Definition: emmintrin.h:52
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors...
Definition: emmintrin.h:2320
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4199
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors...
Definition: emmintrin.h:2279
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1813
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:2027
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:633
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2785
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3193
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4574
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:570