clang  16.0.0git
emmintrin.h
Go to the documentation of this file.
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* Public SSE2 vector types: 16-byte vectors that are 16-byte aligned.
 * __m128d has double elements, __m128i has long long elements. */
19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21 
/* Same element types and vector size as above, but declared with an
 * alignment of 1 (the "_u" variants). */
22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23 typedef long long __m128i_u
24  __attribute__((__vector_size__(16), __aligned__(1)));
25 
26 /* Element-typed 16-byte vector types, declared without an explicit
    alignment attribute. The functions in this file cast to them (for
    example __v2df and __v2du) before operating on a vector. */
27 typedef double __v2df __attribute__((__vector_size__(16)));
28 typedef long long __v2di __attribute__((__vector_size__(16)));
29 typedef short __v8hi __attribute__((__vector_size__(16)));
30 typedef char __v16qi __attribute__((__vector_size__(16)));
31 
32 /* Unsigned counterparts of the element-typed vector types. */
33 typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36 
37 /* We need an explicitly signed variant for char. Note that this shouldn't
38  * appear in the interface though. */
39 typedef signed char __v16qs __attribute__((__vector_size__(16)));
40 
41 /* Define the default attributes for the functions in this file:
    always inlined, no debug info, compiled for the "sse2" target feature,
    with a minimum vector width of 128. The _MMX variant targets "mmx,sse2"
    and uses a minimum vector width of 64. */
42 #define __DEFAULT_FN_ATTRS \
43  __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
44  __min_vector_width__(128)))
45 #define __DEFAULT_FN_ATTRS_MMX \
46  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), \
47  __min_vector_width__(64)))
48 
49 /// Adds lower double-precision values in both operands and returns the
50 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
51 /// are copied from the upper double-precision value of the first operand.
52 ///
53 /// \headerfile <x86intrin.h>
54 ///
55 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
56 ///
57 /// \param __a
58 /// A 128-bit vector of [2 x double] containing one of the source operands.
59 /// \param __b
60 /// A 128-bit vector of [2 x double] containing one of the source operands.
61 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
62 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
63 /// from the upper 64 bits of the first source operand.
64 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
65  __m128d __b) {
66  __a[0] += __b[0];
67  return __a;
68 }
69 
70 /// Adds two 128-bit vectors of [2 x double].
71 ///
72 /// \headerfile <x86intrin.h>
73 ///
74 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
75 ///
76 /// \param __a
77 /// A 128-bit vector of [2 x double] containing one of the source operands.
78 /// \param __b
79 /// A 128-bit vector of [2 x double] containing one of the source operands.
80 /// \returns A 128-bit vector of [2 x double] containing the sums of both
81 /// operands.
82 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
83  __m128d __b) {
84  return (__m128d)((__v2df)__a + (__v2df)__b);
85 }
86 
87 /// Subtracts the lower double-precision value of the second operand
88 /// from the lower double-precision value of the first operand and returns
89 /// the difference in the lower 64 bits of the result. The upper 64 bits of
90 /// the result are copied from the upper double-precision value of the first
91 /// operand.
92 ///
93 /// \headerfile <x86intrin.h>
94 ///
95 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
96 ///
97 /// \param __a
98 /// A 128-bit vector of [2 x double] containing the minuend.
99 /// \param __b
100 /// A 128-bit vector of [2 x double] containing the subtrahend.
101 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
102 /// difference of the lower 64 bits of both operands. The upper 64 bits are
103 /// copied from the upper 64 bits of the first source operand.
104 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
105  __m128d __b) {
106  __a[0] -= __b[0];
107  return __a;
108 }
109 
110 /// Subtracts two 128-bit vectors of [2 x double].
111 ///
112 /// \headerfile <x86intrin.h>
113 ///
114 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
115 ///
116 /// \param __a
117 /// A 128-bit vector of [2 x double] containing the minuend.
118 /// \param __b
119 /// A 128-bit vector of [2 x double] containing the subtrahend.
120 /// \returns A 128-bit vector of [2 x double] containing the differences between
121 /// both operands.
122 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
123  __m128d __b) {
124  return (__m128d)((__v2df)__a - (__v2df)__b);
125 }
126 
127 /// Multiplies lower double-precision values in both operands and returns
128 /// the product in the lower 64 bits of the result. The upper 64 bits of the
129 /// result are copied from the upper double-precision value of the first
130 /// operand.
131 ///
132 /// \headerfile <x86intrin.h>
133 ///
134 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
135 ///
136 /// \param __a
137 /// A 128-bit vector of [2 x double] containing one of the source operands.
138 /// \param __b
139 /// A 128-bit vector of [2 x double] containing one of the source operands.
140 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
141 /// product of the lower 64 bits of both operands. The upper 64 bits are
142 /// copied from the upper 64 bits of the first source operand.
143 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
144  __m128d __b) {
145  __a[0] *= __b[0];
146  return __a;
147 }
148 
149 /// Multiplies two 128-bit vectors of [2 x double].
150 ///
151 /// \headerfile <x86intrin.h>
152 ///
153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
154 ///
155 /// \param __a
156 /// A 128-bit vector of [2 x double] containing one of the operands.
157 /// \param __b
158 /// A 128-bit vector of [2 x double] containing one of the operands.
159 /// \returns A 128-bit vector of [2 x double] containing the products of both
160 /// operands.
161 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
162  __m128d __b) {
163  return (__m128d)((__v2df)__a * (__v2df)__b);
164 }
165 
166 /// Divides the lower double-precision value of the first operand by the
167 /// lower double-precision value of the second operand and returns the
168 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
169 /// result are copied from the upper double-precision value of the first
170 /// operand.
171 ///
172 /// \headerfile <x86intrin.h>
173 ///
174 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
175 ///
176 /// \param __a
177 /// A 128-bit vector of [2 x double] containing the dividend.
178 /// \param __b
179 /// A 128-bit vector of [2 x double] containing divisor.
180 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
181 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
182 /// copied from the upper 64 bits of the first source operand.
183 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
184  __m128d __b) {
185  __a[0] /= __b[0];
186  return __a;
187 }
188 
189 /// Performs an element-by-element division of two 128-bit vectors of
190 /// [2 x double].
191 ///
192 /// \headerfile <x86intrin.h>
193 ///
194 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
195 ///
196 /// \param __a
197 /// A 128-bit vector of [2 x double] containing the dividend.
198 /// \param __b
199 /// A 128-bit vector of [2 x double] containing the divisor.
200 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
201 /// operands.
202 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
203  __m128d __b) {
204  return (__m128d)((__v2df)__a / (__v2df)__b);
205 }
206 
207 /// Calculates the square root of the lower double-precision value of
208 /// the second operand and returns it in the lower 64 bits of the result.
209 /// The upper 64 bits of the result are copied from the upper
210 /// double-precision value of the first operand.
211 ///
212 /// \headerfile <x86intrin.h>
213 ///
214 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
215 ///
216 /// \param __a
217 /// A 128-bit vector of [2 x double] containing one of the operands. The
218 /// upper 64 bits of this operand are copied to the upper 64 bits of the
219 /// result.
220 /// \param __b
221 /// A 128-bit vector of [2 x double] containing one of the operands. The
222 /// square root is calculated using the lower 64 bits of this operand.
223 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
224 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
225 /// bits are copied from the upper 64 bits of operand \a __a.
226 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
227  __m128d __b) {
228  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
229  return __extension__(__m128d){__c[0], __a[1]};
230 }
231 
232 /// Calculates the square root of the each of two values stored in a
233 /// 128-bit vector of [2 x double].
234 ///
235 /// \headerfile <x86intrin.h>
236 ///
237 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
238 ///
239 /// \param __a
240 /// A 128-bit vector of [2 x double].
241 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
242 /// values in the operand.
243 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
244  return __builtin_ia32_sqrtpd((__v2df)__a);
245 }
246 
247 /// Compares lower 64-bit double-precision values of both operands, and
248 /// returns the lesser of the pair of values in the lower 64-bits of the
249 /// result. The upper 64 bits of the result are copied from the upper
250 /// double-precision value of the first operand.
251 ///
252 /// \headerfile <x86intrin.h>
253 ///
254 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
255 ///
256 /// \param __a
257 /// A 128-bit vector of [2 x double] containing one of the operands. The
258 /// lower 64 bits of this operand are used in the comparison.
259 /// \param __b
260 /// A 128-bit vector of [2 x double] containing one of the operands. The
261 /// lower 64 bits of this operand are used in the comparison.
262 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
263 /// minimum value between both operands. The upper 64 bits are copied from
264 /// the upper 64 bits of the first source operand.
265 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
266  __m128d __b) {
267  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
268 }
269 
270 /// Performs element-by-element comparison of the two 128-bit vectors of
271 /// [2 x double] and returns the vector containing the lesser of each pair of
272 /// values.
273 ///
274 /// \headerfile <x86intrin.h>
275 ///
276 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
277 ///
278 /// \param __a
279 /// A 128-bit vector of [2 x double] containing one of the operands.
280 /// \param __b
281 /// A 128-bit vector of [2 x double] containing one of the operands.
282 /// \returns A 128-bit vector of [2 x double] containing the minimum values
283 /// between both operands.
284 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
285  __m128d __b) {
286  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
287 }
288 
289 /// Compares lower 64-bit double-precision values of both operands, and
290 /// returns the greater of the pair of values in the lower 64-bits of the
291 /// result. The upper 64 bits of the result are copied from the upper
292 /// double-precision value of the first operand.
293 ///
294 /// \headerfile <x86intrin.h>
295 ///
296 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
297 ///
298 /// \param __a
299 /// A 128-bit vector of [2 x double] containing one of the operands. The
300 /// lower 64 bits of this operand are used in the comparison.
301 /// \param __b
302 /// A 128-bit vector of [2 x double] containing one of the operands. The
303 /// lower 64 bits of this operand are used in the comparison.
304 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
305 /// maximum value between both operands. The upper 64 bits are copied from
306 /// the upper 64 bits of the first source operand.
307 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
308  __m128d __b) {
309  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
310 }
311 
312 /// Performs element-by-element comparison of the two 128-bit vectors of
313 /// [2 x double] and returns the vector containing the greater of each pair
314 /// of values.
315 ///
316 /// \headerfile <x86intrin.h>
317 ///
318 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
319 ///
320 /// \param __a
321 /// A 128-bit vector of [2 x double] containing one of the operands.
322 /// \param __b
323 /// A 128-bit vector of [2 x double] containing one of the operands.
324 /// \returns A 128-bit vector of [2 x double] containing the maximum values
325 /// between both operands.
326 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
327  __m128d __b) {
328  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
329 }
330 
331 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
332 ///
333 /// \headerfile <x86intrin.h>
334 ///
335 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
336 ///
337 /// \param __a
338 /// A 128-bit vector of [2 x double] containing one of the source operands.
339 /// \param __b
340 /// A 128-bit vector of [2 x double] containing one of the source operands.
341 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
342 /// values between both operands.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
344  __m128d __b) {
345  return (__m128d)((__v2du)__a & (__v2du)__b);
346 }
347 
348 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
349 /// the one's complement of the values contained in the first source operand.
350 ///
351 /// \headerfile <x86intrin.h>
352 ///
353 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
354 ///
355 /// \param __a
356 /// A 128-bit vector of [2 x double] containing the left source operand. The
357 /// one's complement of this value is used in the bitwise AND.
358 /// \param __b
359 /// A 128-bit vector of [2 x double] containing the right source operand.
360 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
361 /// values in the second operand and the one's complement of the first
362 /// operand.
363 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
364  __m128d __b) {
365  return (__m128d)(~(__v2du)__a & (__v2du)__b);
366 }
367 
368 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
369 ///
370 /// \headerfile <x86intrin.h>
371 ///
372 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
373 ///
374 /// \param __a
375 /// A 128-bit vector of [2 x double] containing one of the source operands.
376 /// \param __b
377 /// A 128-bit vector of [2 x double] containing one of the source operands.
378 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
379 /// values between both operands.
380 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
381  __m128d __b) {
382  return (__m128d)((__v2du)__a | (__v2du)__b);
383 }
384 
385 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
386 ///
387 /// \headerfile <x86intrin.h>
388 ///
389 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
390 ///
391 /// \param __a
392 /// A 128-bit vector of [2 x double] containing one of the source operands.
393 /// \param __b
394 /// A 128-bit vector of [2 x double] containing one of the source operands.
395 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
396 /// values between both operands.
397 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
398  __m128d __b) {
399  return (__m128d)((__v2du)__a ^ (__v2du)__b);
400 }
401 
402 /// Compares each of the corresponding double-precision values of the
403 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
404 /// for false, 0xFFFFFFFFFFFFFFFF for true.
405 ///
406 /// \headerfile <x86intrin.h>
407 ///
408 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
409 ///
410 /// \param __a
411 /// A 128-bit vector of [2 x double].
412 /// \param __b
413 /// A 128-bit vector of [2 x double].
414 /// \returns A 128-bit vector containing the comparison results.
415 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
416  __m128d __b) {
417  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
418 }
419 
420 /// Compares each of the corresponding double-precision values of the
421 /// 128-bit vectors of [2 x double] to determine if the values in the first
422 /// operand are less than those in the second operand. Each comparison
423 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
424 ///
425 /// \headerfile <x86intrin.h>
426 ///
427 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
428 ///
429 /// \param __a
430 /// A 128-bit vector of [2 x double].
431 /// \param __b
432 /// A 128-bit vector of [2 x double].
433 /// \returns A 128-bit vector containing the comparison results.
434 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
435  __m128d __b) {
436  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
437 }
438 
439 /// Compares each of the corresponding double-precision values of the
440 /// 128-bit vectors of [2 x double] to determine if the values in the first
441 /// operand are less than or equal to those in the second operand.
442 ///
443 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
444 ///
445 /// \headerfile <x86intrin.h>
446 ///
447 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
448 ///
449 /// \param __a
450 /// A 128-bit vector of [2 x double].
451 /// \param __b
452 /// A 128-bit vector of [2 x double].
453 /// \returns A 128-bit vector containing the comparison results.
454 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
455  __m128d __b) {
456  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
457 }
458 
459 /// Compares each of the corresponding double-precision values of the
460 /// 128-bit vectors of [2 x double] to determine if the values in the first
461 /// operand are greater than those in the second operand.
462 ///
463 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
464 ///
465 /// \headerfile <x86intrin.h>
466 ///
467 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
468 ///
469 /// \param __a
470 /// A 128-bit vector of [2 x double].
471 /// \param __b
472 /// A 128-bit vector of [2 x double].
473 /// \returns A 128-bit vector containing the comparison results.
474 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
475  __m128d __b) {
476  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
477 }
478 
479 /// Compares each of the corresponding double-precision values of the
480 /// 128-bit vectors of [2 x double] to determine if the values in the first
481 /// operand are greater than or equal to those in the second operand.
482 ///
483 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
484 ///
485 /// \headerfile <x86intrin.h>
486 ///
487 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
488 ///
489 /// \param __a
490 /// A 128-bit vector of [2 x double].
491 /// \param __b
492 /// A 128-bit vector of [2 x double].
493 /// \returns A 128-bit vector containing the comparison results.
494 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
495  __m128d __b) {
496  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
497 }
498 
499 /// Compares each of the corresponding double-precision values of the
500 /// 128-bit vectors of [2 x double] to determine if the values in the first
501 /// operand are ordered with respect to those in the second operand.
502 ///
503 /// A pair of double-precision values are "ordered" with respect to each
504 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
505 /// 0xFFFFFFFFFFFFFFFF for true.
506 ///
507 /// \headerfile <x86intrin.h>
508 ///
509 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
510 ///
511 /// \param __a
512 /// A 128-bit vector of [2 x double].
513 /// \param __b
514 /// A 128-bit vector of [2 x double].
515 /// \returns A 128-bit vector containing the comparison results.
516 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
517  __m128d __b) {
518  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
519 }
520 
521 /// Compares each of the corresponding double-precision values of the
522 /// 128-bit vectors of [2 x double] to determine if the values in the first
523 /// operand are unordered with respect to those in the second operand.
524 ///
525 /// A pair of double-precision values are "unordered" with respect to each
526 /// other if one or both values are NaN. Each comparison yields 0x0 for
527 /// false, 0xFFFFFFFFFFFFFFFF for true.
528 ///
529 /// \headerfile <x86intrin.h>
530 ///
531 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
532 /// instruction.
533 ///
534 /// \param __a
535 /// A 128-bit vector of [2 x double].
536 /// \param __b
537 /// A 128-bit vector of [2 x double].
538 /// \returns A 128-bit vector containing the comparison results.
539 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
540  __m128d __b) {
541  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
542 }
543 
544 /// Compares each of the corresponding double-precision values of the
545 /// 128-bit vectors of [2 x double] to determine if the values in the first
546 /// operand are unequal to those in the second operand.
547 ///
548 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
549 ///
550 /// \headerfile <x86intrin.h>
551 ///
552 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
553 ///
554 /// \param __a
555 /// A 128-bit vector of [2 x double].
556 /// \param __b
557 /// A 128-bit vector of [2 x double].
558 /// \returns A 128-bit vector containing the comparison results.
559 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
560  __m128d __b) {
561  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
562 }
563 
564 /// Compares each of the corresponding double-precision values of the
565 /// 128-bit vectors of [2 x double] to determine if the values in the first
566 /// operand are not less than those in the second operand.
567 ///
568 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
569 ///
570 /// \headerfile <x86intrin.h>
571 ///
572 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
573 ///
574 /// \param __a
575 /// A 128-bit vector of [2 x double].
576 /// \param __b
577 /// A 128-bit vector of [2 x double].
578 /// \returns A 128-bit vector containing the comparison results.
579 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
580  __m128d __b) {
581  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
582 }
583 
584 /// Compares each of the corresponding double-precision values of the
585 /// 128-bit vectors of [2 x double] to determine if the values in the first
586 /// operand are not less than or equal to those in the second operand.
587 ///
588 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
589 ///
590 /// \headerfile <x86intrin.h>
591 ///
592 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
593 ///
594 /// \param __a
595 /// A 128-bit vector of [2 x double].
596 /// \param __b
597 /// A 128-bit vector of [2 x double].
598 /// \returns A 128-bit vector containing the comparison results.
599 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
600  __m128d __b) {
601  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
602 }
603 
604 /// Compares each of the corresponding double-precision values of the
605 /// 128-bit vectors of [2 x double] to determine if the values in the first
606 /// operand are not greater than those in the second operand.
607 ///
608 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
609 ///
610 /// \headerfile <x86intrin.h>
611 ///
612 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
613 ///
614 /// \param __a
615 /// A 128-bit vector of [2 x double].
616 /// \param __b
617 /// A 128-bit vector of [2 x double].
618 /// \returns A 128-bit vector containing the comparison results.
619 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
620  __m128d __b) {
621  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
622 }
623 
624 /// Compares each of the corresponding double-precision values of the
625 /// 128-bit vectors of [2 x double] to determine if the values in the first
626 /// operand are not greater than or equal to those in the second operand.
627 ///
628 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
629 ///
630 /// \headerfile <x86intrin.h>
631 ///
632 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
633 ///
634 /// \param __a
635 /// A 128-bit vector of [2 x double].
636 /// \param __b
637 /// A 128-bit vector of [2 x double].
638 /// \returns A 128-bit vector containing the comparison results.
639 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
640  __m128d __b) {
641  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
642 }
643 
644 /// Compares the lower double-precision floating-point values in each of
645 /// the two 128-bit floating-point vectors of [2 x double] for equality.
646 ///
647 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
648 ///
649 /// \headerfile <x86intrin.h>
650 ///
651 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
652 ///
653 /// \param __a
654 /// A 128-bit vector of [2 x double]. The lower double-precision value is
655 /// compared to the lower double-precision value of \a __b.
656 /// \param __b
657 /// A 128-bit vector of [2 x double]. The lower double-precision value is
658 /// compared to the lower double-precision value of \a __a.
659 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
660 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
661 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
662  __m128d __b) {
663  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
664 }
665 
666 /// Compares the lower double-precision floating-point values in each of
667 /// the two 128-bit floating-point vectors of [2 x double] to determine if
668 /// the value in the first parameter is less than the corresponding value in
669 /// the second parameter.
670 ///
671 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
672 ///
673 /// \headerfile <x86intrin.h>
674 ///
675 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
676 ///
677 /// \param __a
678 /// A 128-bit vector of [2 x double]. The lower double-precision value is
679 /// compared to the lower double-precision value of \a __b.
680 /// \param __b
681 /// A 128-bit vector of [2 x double]. The lower double-precision value is
682 /// compared to the lower double-precision value of \a __a.
683 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
684 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
685 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
686  __m128d __b) {
687  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
688 }
689 
690 /// Compares the lower double-precision floating-point values in each of
691 /// the two 128-bit floating-point vectors of [2 x double] to determine if
692 /// the value in the first parameter is less than or equal to the
693 /// corresponding value in the second parameter.
694 ///
695 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
696 ///
697 /// \headerfile <x86intrin.h>
698 ///
699 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
700 ///
701 /// \param __a
702 /// A 128-bit vector of [2 x double]. The lower double-precision value is
703 /// compared to the lower double-precision value of \a __b.
704 /// \param __b
705 /// A 128-bit vector of [2 x double]. The lower double-precision value is
706 /// compared to the lower double-precision value of \a __a.
707 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
708 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
710  __m128d __b) {
711  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
712 }
713 
714 /// Compares the lower double-precision floating-point values in each of
715 /// the two 128-bit floating-point vectors of [2 x double] to determine if
716 /// the value in the first parameter is greater than the corresponding value
717 /// in the second parameter.
718 ///
719 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
720 ///
721 /// \headerfile <x86intrin.h>
722 ///
723 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
724 ///
725 /// \param __a
726 /// A 128-bit vector of [2 x double]. The lower double-precision value is
727 /// compared to the lower double-precision value of \a __b.
728 /// \param __b
729 /// A 128-bit vector of [2 x double]. The lower double-precision value is
730 /// compared to the lower double-precision value of \a __a.
731 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
732 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
733 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
734  __m128d __b) {
735  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
736  return __extension__(__m128d){__c[0], __a[1]};
737 }
738 
739 /// Compares the lower double-precision floating-point values in each of
740 /// the two 128-bit floating-point vectors of [2 x double] to determine if
741 /// the value in the first parameter is greater than or equal to the
742 /// corresponding value in the second parameter.
743 ///
744 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
745 ///
746 /// \headerfile <x86intrin.h>
747 ///
748 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
749 ///
750 /// \param __a
751 /// A 128-bit vector of [2 x double]. The lower double-precision value is
752 /// compared to the lower double-precision value of \a __b.
753 /// \param __b
754 /// A 128-bit vector of [2 x double]. The lower double-precision value is
755 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
758 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
759  __m128d __b) {
760  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
761  return __extension__(__m128d){__c[0], __a[1]};
762 }
763 
764 /// Compares the lower double-precision floating-point values in each of
765 /// the two 128-bit floating-point vectors of [2 x double] to determine if
766 /// the value in the first parameter is "ordered" with respect to the
767 /// corresponding value in the second parameter.
768 ///
769 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
770 /// of double-precision values are "ordered" with respect to each other if
771 /// neither value is a NaN.
772 ///
773 /// \headerfile <x86intrin.h>
774 ///
775 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
776 ///
777 /// \param __a
778 /// A 128-bit vector of [2 x double]. The lower double-precision value is
779 /// compared to the lower double-precision value of \a __b.
780 /// \param __b
781 /// A 128-bit vector of [2 x double]. The lower double-precision value is
782 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
785 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
786  __m128d __b) {
787  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
788 }
789 
790 /// Compares the lower double-precision floating-point values in each of
791 /// the two 128-bit floating-point vectors of [2 x double] to determine if
792 /// the value in the first parameter is "unordered" with respect to the
793 /// corresponding value in the second parameter.
794 ///
795 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
796 /// of double-precision values are "unordered" with respect to each other if
797 /// one or both values are NaN.
798 ///
799 /// \headerfile <x86intrin.h>
800 ///
801 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
802 /// instruction.
803 ///
804 /// \param __a
805 /// A 128-bit vector of [2 x double]. The lower double-precision value is
806 /// compared to the lower double-precision value of \a __b.
807 /// \param __b
808 /// A 128-bit vector of [2 x double]. The lower double-precision value is
809 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
812 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
813  __m128d __b) {
814  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
815 }
816 
817 /// Compares the lower double-precision floating-point values in each of
818 /// the two 128-bit floating-point vectors of [2 x double] to determine if
819 /// the value in the first parameter is unequal to the corresponding value in
820 /// the second parameter.
821 ///
822 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
823 ///
824 /// \headerfile <x86intrin.h>
825 ///
826 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
827 ///
828 /// \param __a
829 /// A 128-bit vector of [2 x double]. The lower double-precision value is
830 /// compared to the lower double-precision value of \a __b.
831 /// \param __b
832 /// A 128-bit vector of [2 x double]. The lower double-precision value is
833 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
836 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
837  __m128d __b) {
838  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
839 }
840 
841 /// Compares the lower double-precision floating-point values in each of
842 /// the two 128-bit floating-point vectors of [2 x double] to determine if
843 /// the value in the first parameter is not less than the corresponding
844 /// value in the second parameter.
845 ///
846 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
847 ///
848 /// \headerfile <x86intrin.h>
849 ///
850 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
851 ///
852 /// \param __a
853 /// A 128-bit vector of [2 x double]. The lower double-precision value is
854 /// compared to the lower double-precision value of \a __b.
855 /// \param __b
856 /// A 128-bit vector of [2 x double]. The lower double-precision value is
857 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
860 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
861  __m128d __b) {
862  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
863 }
864 
865 /// Compares the lower double-precision floating-point values in each of
866 /// the two 128-bit floating-point vectors of [2 x double] to determine if
867 /// the value in the first parameter is not less than or equal to the
868 /// corresponding value in the second parameter.
869 ///
870 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
871 ///
872 /// \headerfile <x86intrin.h>
873 ///
874 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
875 ///
876 /// \param __a
877 /// A 128-bit vector of [2 x double]. The lower double-precision value is
878 /// compared to the lower double-precision value of \a __b.
879 /// \param __b
880 /// A 128-bit vector of [2 x double]. The lower double-precision value is
881 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
884 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
885  __m128d __b) {
886  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
887 }
888 
889 /// Compares the lower double-precision floating-point values in each of
890 /// the two 128-bit floating-point vectors of [2 x double] to determine if
891 /// the value in the first parameter is not greater than the corresponding
892 /// value in the second parameter.
893 ///
894 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
895 ///
896 /// \headerfile <x86intrin.h>
897 ///
898 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
899 ///
900 /// \param __a
901 /// A 128-bit vector of [2 x double]. The lower double-precision value is
902 /// compared to the lower double-precision value of \a __b.
903 /// \param __b
904 /// A 128-bit vector of [2 x double]. The lower double-precision value is
905 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
908 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
909  __m128d __b) {
910  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
911  return __extension__(__m128d){__c[0], __a[1]};
912 }
913 
914 /// Compares the lower double-precision floating-point values in each of
915 /// the two 128-bit floating-point vectors of [2 x double] to determine if
916 /// the value in the first parameter is not greater than or equal to the
917 /// corresponding value in the second parameter.
918 ///
919 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
920 ///
921 /// \headerfile <x86intrin.h>
922 ///
923 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
924 ///
925 /// \param __a
926 /// A 128-bit vector of [2 x double]. The lower double-precision value is
927 /// compared to the lower double-precision value of \a __b.
928 /// \param __b
929 /// A 128-bit vector of [2 x double]. The lower double-precision value is
930 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
933 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
934  __m128d __b) {
935  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
936  return __extension__(__m128d){__c[0], __a[1]};
937 }
938 
939 /// Compares the lower double-precision floating-point values in each of
940 /// the two 128-bit floating-point vectors of [2 x double] for equality.
941 ///
942 /// The comparison yields 0 for false, 1 for true. If either of the two
943 /// lower double-precision values is NaN, 0 is returned.
944 ///
945 /// \headerfile <x86intrin.h>
946 ///
947 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
948 ///
949 /// \param __a
950 /// A 128-bit vector of [2 x double]. The lower double-precision value is
951 /// compared to the lower double-precision value of \a __b.
952 /// \param __b
953 /// A 128-bit vector of [2 x double]. The lower double-precision value is
954 /// compared to the lower double-precision value of \a __a.
955 /// \returns An integer containing the comparison results. If either of the two
956 /// lower double-precision values is NaN, 0 is returned.
957 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
958  __m128d __b) {
959  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
960 }
961 
962 /// Compares the lower double-precision floating-point values in each of
963 /// the two 128-bit floating-point vectors of [2 x double] to determine if
964 /// the value in the first parameter is less than the corresponding value in
965 /// the second parameter.
966 ///
967 /// The comparison yields 0 for false, 1 for true. If either of the two
968 /// lower double-precision values is NaN, 0 is returned.
969 ///
970 /// \headerfile <x86intrin.h>
971 ///
972 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
973 ///
974 /// \param __a
975 /// A 128-bit vector of [2 x double]. The lower double-precision value is
976 /// compared to the lower double-precision value of \a __b.
977 /// \param __b
978 /// A 128-bit vector of [2 x double]. The lower double-precision value is
979 /// compared to the lower double-precision value of \a __a.
980 /// \returns An integer containing the comparison results. If either of the two
981 /// lower double-precision values is NaN, 0 is returned.
982 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
983  __m128d __b) {
984  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
985 }
986 
987 /// Compares the lower double-precision floating-point values in each of
988 /// the two 128-bit floating-point vectors of [2 x double] to determine if
989 /// the value in the first parameter is less than or equal to the
990 /// corresponding value in the second parameter.
991 ///
992 /// The comparison yields 0 for false, 1 for true. If either of the two
993 /// lower double-precision values is NaN, 0 is returned.
994 ///
995 /// \headerfile <x86intrin.h>
996 ///
997 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
998 ///
999 /// \param __a
1000 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1001 /// compared to the lower double-precision value of \a __b.
1002 /// \param __b
1003 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1004 /// compared to the lower double-precision value of \a __a.
1005 /// \returns An integer containing the comparison results. If either of the two
1006 /// lower double-precision values is NaN, 0 is returned.
1007 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1008  __m128d __b) {
1009  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1010 }
1011 
1012 /// Compares the lower double-precision floating-point values in each of
1013 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1014 /// the value in the first parameter is greater than the corresponding value
1015 /// in the second parameter.
1016 ///
1017 /// The comparison yields 0 for false, 1 for true. If either of the two
1018 /// lower double-precision values is NaN, 0 is returned.
1019 ///
1020 /// \headerfile <x86intrin.h>
1021 ///
1022 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1023 ///
1024 /// \param __a
1025 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1026 /// compared to the lower double-precision value of \a __b.
1027 /// \param __b
1028 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1029 /// compared to the lower double-precision value of \a __a.
1030 /// \returns An integer containing the comparison results. If either of the two
1031 /// lower double-precision values is NaN, 0 is returned.
1032 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1033  __m128d __b) {
1034  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1035 }
1036 
1037 /// Compares the lower double-precision floating-point values in each of
1038 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1039 /// the value in the first parameter is greater than or equal to the
1040 /// corresponding value in the second parameter.
1041 ///
1042 /// The comparison yields 0 for false, 1 for true. If either of the two
1043 /// lower double-precision values is NaN, 0 is returned.
1044 ///
1045 /// \headerfile <x86intrin.h>
1046 ///
1047 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1048 ///
1049 /// \param __a
1050 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1051 /// compared to the lower double-precision value of \a __b.
1052 /// \param __b
1053 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1054 /// compared to the lower double-precision value of \a __a.
1055 /// \returns An integer containing the comparison results. If either of the two
1056 /// lower double-precision values is NaN, 0 is returned.
1057 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1058  __m128d __b) {
1059  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1060 }
1061 
1062 /// Compares the lower double-precision floating-point values in each of
1063 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1064 /// the value in the first parameter is unequal to the corresponding value in
1065 /// the second parameter.
1066 ///
1067 /// The comparison yields 0 for false, 1 for true. If either of the two
1068 /// lower double-precision values is NaN, 1 is returned.
1069 ///
1070 /// \headerfile <x86intrin.h>
1071 ///
1072 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1073 ///
1074 /// \param __a
1075 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1076 /// compared to the lower double-precision value of \a __b.
1077 /// \param __b
1078 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1079 /// compared to the lower double-precision value of \a __a.
1080 /// \returns An integer containing the comparison results. If either of the two
1081 /// lower double-precision values is NaN, 1 is returned.
1082 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1083  __m128d __b) {
1084  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1085 }
1086 
1087 /// Compares the lower double-precision floating-point values in each of
1088 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1089 /// comparison yields 0 for false, 1 for true.
1090 ///
1091 /// If either of the two lower double-precision values is NaN, 0 is returned.
1092 ///
1093 /// \headerfile <x86intrin.h>
1094 ///
1095 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1096 ///
1097 /// \param __a
1098 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1099 /// compared to the lower double-precision value of \a __b.
1100 /// \param __b
1101 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1102 /// compared to the lower double-precision value of \a __a.
1103 /// \returns An integer containing the comparison results. If either of the two
1104 /// lower double-precision values is NaN, 0 is returned.
1105 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1106  __m128d __b) {
1107  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1108 }
1109 
1110 /// Compares the lower double-precision floating-point values in each of
1111 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1112 /// the value in the first parameter is less than the corresponding value in
1113 /// the second parameter.
1114 ///
1115 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1116 /// double-precision values is NaN, 0 is returned.
1117 ///
1118 /// \headerfile <x86intrin.h>
1119 ///
1120 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1121 ///
1122 /// \param __a
1123 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1124 /// compared to the lower double-precision value of \a __b.
1125 /// \param __b
1126 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1127 /// compared to the lower double-precision value of \a __a.
1128 /// \returns An integer containing the comparison results. If either of the two
1129 /// lower double-precision values is NaN, 0 is returned.
1130 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1131  __m128d __b) {
1132  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1133 }
1134 
1135 /// Compares the lower double-precision floating-point values in each of
1136 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1137 /// the value in the first parameter is less than or equal to the
1138 /// corresponding value in the second parameter.
1139 ///
1140 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1141 /// double-precision values is NaN, 0 is returned.
1142 ///
1143 /// \headerfile <x86intrin.h>
1144 ///
1145 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1146 ///
1147 /// \param __a
1148 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1149 /// compared to the lower double-precision value of \a __b.
1150 /// \param __b
1151 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1152 /// compared to the lower double-precision value of \a __a.
1153 /// \returns An integer containing the comparison results. If either of the two
1154 /// lower double-precision values is NaN, 0 is returned.
1155 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1156  __m128d __b) {
1157  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1158 }
1159 
1160 /// Compares the lower double-precision floating-point values in each of
1161 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1162 /// the value in the first parameter is greater than the corresponding value
1163 /// in the second parameter.
1164 ///
1165 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1166 /// double-precision values is NaN, 0 is returned.
1167 ///
1168 /// \headerfile <x86intrin.h>
1169 ///
1170 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1171 ///
1172 /// \param __a
1173 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1174 /// compared to the lower double-precision value of \a __b.
1175 /// \param __b
1176 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1177 /// compared to the lower double-precision value of \a __a.
1178 /// \returns An integer containing the comparison results. If either of the two
1179 /// lower double-precision values is NaN, 0 is returned.
1180 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1181  __m128d __b) {
1182  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1183 }
1184 
1185 /// Compares the lower double-precision floating-point values in each of
1186 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1187 /// the value in the first parameter is greater than or equal to the
1188 /// corresponding value in the second parameter.
1189 ///
1190 /// The comparison yields 0 for false, 1 for true. If either of the two
1191 /// lower double-precision values is NaN, 0 is returned.
1192 ///
1193 /// \headerfile <x86intrin.h>
1194 ///
1195 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1196 ///
1197 /// \param __a
1198 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1199 /// compared to the lower double-precision value of \a __b.
1200 /// \param __b
1201 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1202 /// compared to the lower double-precision value of \a __a.
1203 /// \returns An integer containing the comparison results. If either of the two
1204 /// lower double-precision values is NaN, 0 is returned.
1205 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1206  __m128d __b) {
1207  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1208 }
1209 
1210 /// Compares the lower double-precision floating-point values in each of
1211 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1212 /// the value in the first parameter is unequal to the corresponding value in
1213 /// the second parameter.
1214 ///
1215 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1216 /// double-precision values is NaN, 1 is returned.
1217 ///
1218 /// \headerfile <x86intrin.h>
1219 ///
1220 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1221 ///
1222 /// \param __a
1223 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1224 /// compared to the lower double-precision value of \a __b.
1225 /// \param __b
1226 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1227 /// compared to the lower double-precision value of \a __a.
1228 /// \returns An integer containing the comparison result. If either of the two
1229 /// lower double-precision values is NaN, 1 is returned.
1230 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1231  __m128d __b) {
1232  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1233 }
1234 
1235 /// Converts the two double-precision floating-point elements of a
1236 /// 128-bit vector of [2 x double] into two single-precision floating-point
1237 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1238 /// The upper 64 bits of the result vector are set to zero.
1239 ///
1240 /// \headerfile <x86intrin.h>
1241 ///
1242 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1243 ///
1244 /// \param __a
1245 /// A 128-bit vector of [2 x double].
1246 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1247 /// converted values. The upper 64 bits are set to zero.
1248 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1249  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1250 }
1251 
1252 /// Converts the lower two single-precision floating-point elements of a
1253 /// 128-bit vector of [4 x float] into two double-precision floating-point
1254 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1255 /// elements of the input vector are unused.
1256 ///
1257 /// \headerfile <x86intrin.h>
1258 ///
1259 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1260 ///
1261 /// \param __a
1262 /// A 128-bit vector of [4 x float]. The lower two single-precision
1263 /// floating-point elements are converted to double-precision values. The
1264 /// upper two elements are unused.
1265 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1266 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1267  return (__m128d) __builtin_convertvector(
1268  __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1269 }
1270 
1271 /// Converts the lower two integer elements of a 128-bit vector of
1272 /// [4 x i32] into two double-precision floating-point values, returned in a
1273 /// 128-bit vector of [2 x double].
1274 ///
1275 /// The upper two elements of the input vector are unused.
1276 ///
1277 /// \headerfile <x86intrin.h>
1278 ///
1279 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1280 ///
1281 /// \param __a
1282 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1283 /// converted to double-precision values.
1284 ///
1285 /// The upper two elements are unused.
1286 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1287 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1288  return (__m128d) __builtin_convertvector(
1289  __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1290 }
1291 
1292 /// Converts the two double-precision floating-point elements of a
1293 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1294 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1295 /// 64 bits of the result vector are set to zero.
1296 ///
1297 /// \headerfile <x86intrin.h>
1298 ///
1299 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1300 ///
1301 /// \param __a
1302 /// A 128-bit vector of [2 x double].
1303 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1304 /// converted values. The upper 64 bits are set to zero.
1305 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1306  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1307 }
1308 
1309 /// Converts the low-order element of a 128-bit vector of [2 x double]
1310 /// into a 32-bit signed integer value.
1311 ///
1312 /// \headerfile <x86intrin.h>
1313 ///
1314 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1315 ///
1316 /// \param __a
1317 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1318 /// conversion.
1319 /// \returns A 32-bit signed integer containing the converted value.
1320 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1321  return __builtin_ia32_cvtsd2si((__v2df)__a);
1322 }
1323 
1324 /// Converts the lower double-precision floating-point element of a
1325 /// 128-bit vector of [2 x double], in the second parameter, into a
1326 /// single-precision floating-point value, returned in the lower 32 bits of a
1327 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1328 /// copied from the upper 96 bits of the first parameter.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1333 ///
1334 /// \param __a
1335 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1336 /// copied to the upper 96 bits of the result.
1337 /// \param __b
1338 /// A 128-bit vector of [2 x double]. The lower double-precision
1339 /// floating-point element is used in the conversion.
1340 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1341 /// converted value from the second parameter. The upper 96 bits are copied
1342 /// from the upper 96 bits of the first parameter.
1343 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1344  __m128d __b) {
1345  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1346 }
1347 
1348 /// Converts a 32-bit signed integer value, in the second parameter, into
1349 /// a double-precision floating-point value, returned in the lower 64 bits of
1350 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1351 /// are copied from the upper 64 bits of the first parameter.
1352 ///
1353 /// \headerfile <x86intrin.h>
1354 ///
1355 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1356 ///
1357 /// \param __a
1358 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1359 /// copied to the upper 64 bits of the result.
1360 /// \param __b
1361 /// A 32-bit signed integer containing the value to be converted.
1362 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1363 /// converted value from the second parameter. The upper 64 bits are copied
1364 /// from the upper 64 bits of the first parameter.
1365 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1366  int __b) {
1367  __a[0] = __b;
1368  return __a;
1369 }
1370 
1371 /// Converts the lower single-precision floating-point element of a
1372 /// 128-bit vector of [4 x float], in the second parameter, into a
1373 /// double-precision floating-point value, returned in the lower 64 bits of
1374 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1375 /// are copied from the upper 64 bits of the first parameter.
1376 ///
1377 /// \headerfile <x86intrin.h>
1378 ///
1379 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1380 ///
1381 /// \param __a
1382 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1383 /// copied to the upper 64 bits of the result.
1384 /// \param __b
1385 /// A 128-bit vector of [4 x float]. The lower single-precision
1386 /// floating-point element is used in the conversion.
1387 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1388 /// converted value from the second parameter. The upper 64 bits are copied
1389 /// from the upper 64 bits of the first parameter.
1390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1391  __m128 __b) {
1392  __a[0] = __b[0];
1393  return __a;
1394 }
1395 
1396 /// Converts the two double-precision floating-point elements of a
1397 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1398 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1399 ///
1400 /// If the result of either conversion is inexact, the result is truncated
1401 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1402 /// 64 bits of the result vector are set to zero.
1403 ///
1404 /// \headerfile <x86intrin.h>
1405 ///
1406 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1407 /// instruction.
1408 ///
1409 /// \param __a
1410 /// A 128-bit vector of [2 x double].
1411 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1412 /// converted values. The upper 64 bits are set to zero.
1413 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1414  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1415 }
1416 
1417 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1418 /// signed integer value, truncating the result when it is inexact.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1423 /// instruction.
1424 ///
1425 /// \param __a
1426 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1427 /// conversion.
1428 /// \returns A 32-bit signed integer containing the converted value.
1429 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1430  return __builtin_ia32_cvttsd2si((__v2df)__a);
1431 }
1432 
1433 /// Converts the two double-precision floating-point elements of a
1434 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1435 /// returned in a 64-bit vector of [2 x i32].
1436 ///
1437 /// \headerfile <x86intrin.h>
1438 ///
1439 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1440 ///
1441 /// \param __a
1442 /// A 128-bit vector of [2 x double].
1443 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1444 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1445  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1446 }
1447 
1448 /// Converts the two double-precision floating-point elements of a
1449 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1450 /// returned in a 64-bit vector of [2 x i32].
1451 ///
1452 /// If the result of either conversion is inexact, the result is truncated
1453 /// (rounded towards zero) regardless of the current MXCSR setting.
1454 ///
1455 /// \headerfile <x86intrin.h>
1456 ///
1457 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1458 ///
1459 /// \param __a
1460 /// A 128-bit vector of [2 x double].
1461 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1462 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1463  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1464 }
1465 
1466 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1467 /// [2 x i32] into two double-precision floating-point values, returned in a
1468 /// 128-bit vector of [2 x double].
1469 ///
1470 /// \headerfile <x86intrin.h>
1471 ///
1472 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1473 ///
1474 /// \param __a
1475 /// A 64-bit vector of [2 x i32].
1476 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1477 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1478  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1479 }
1480 
1481 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1482 /// a double-precision floating-point value.
1483 ///
1484 /// \headerfile <x86intrin.h>
1485 ///
1486 /// This intrinsic has no corresponding instruction.
1487 ///
1488 /// \param __a
1489 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1490 /// \returns A double-precision floating-point value copied from the lower 64
1491 /// bits of \a __a.
1492 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1493  return __a[0];
1494 }
1495 
1496 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1497 /// memory location.
1498 ///
1499 /// \headerfile <x86intrin.h>
1500 ///
1501 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1502 ///
1503 /// \param __dp
1504 /// A pointer to a 128-bit memory location. The address of the memory
1505 /// location has to be 16-byte aligned.
1506 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1507 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1508  return *(const __m128d *)__dp;
1509 }
1510 
1511 /// Loads a double-precision floating-point value from a specified memory
1512 /// location and duplicates it to both vector elements of a 128-bit vector of
1513 /// [2 x double].
1514 ///
1515 /// \headerfile <x86intrin.h>
1516 ///
1517 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1518 ///
1519 /// \param __dp
1520 /// A pointer to a memory location containing a double-precision value.
1521 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1522 /// duplicated values.
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1524  struct __mm_load1_pd_struct {
1525  double __u;
1526  } __attribute__((__packed__, __may_alias__));
1527  double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1528  return __extension__(__m128d){__u, __u};
1529 }
1530 
/* Alternate spelling that forwards directly to _mm_load1_pd. */
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1532 
1533 /// Loads two double-precision values, in reverse order, from an aligned
1534 /// memory location into a 128-bit vector of [2 x double].
1535 ///
1536 /// \headerfile <x86intrin.h>
1537 ///
1538 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1539 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1540 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1541 ///
1542 /// \param __dp
1543 /// A 16-byte aligned pointer to an array of double-precision values to be
1544 /// loaded in reverse order.
1545 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1546 /// values.
1547 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1548  __m128d __u = *(const __m128d *)__dp;
1549  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1550 }
1551 
1552 /// Loads a 128-bit floating-point vector of [2 x double] from an
1553 /// unaligned memory location.
1554 ///
1555 /// \headerfile <x86intrin.h>
1556 ///
1557 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1558 ///
1559 /// \param __dp
1560 /// A pointer to a 128-bit memory location. The address of the memory
1561 /// location does not have to be aligned.
1562 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1563 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1564  struct __loadu_pd {
1565  __m128d_u __v;
1566  } __attribute__((__packed__, __may_alias__));
1567  return ((const struct __loadu_pd *)__dp)->__v;
1568 }
1569 
1570 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1571 /// vector and clears the upper element.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1576 ///
1577 /// \param __a
1578 /// A pointer to a 64-bit memory location. The address of the memory
1579 /// location does not have to be aligned.
1580 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1582  struct __loadu_si64 {
1583  long long __v;
1584  } __attribute__((__packed__, __may_alias__));
1585  long long __u = ((const struct __loadu_si64 *)__a)->__v;
1586  return __extension__(__m128i)(__v2di){__u, 0LL};
1587 }
1588 
1589 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1590 /// vector and clears the upper elements.
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1595 ///
1596 /// \param __a
1597 /// A pointer to a 32-bit memory location. The address of the memory
1598 /// location does not have to be aligned.
1599 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1600 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1601  struct __loadu_si32 {
1602  int __v;
1603  } __attribute__((__packed__, __may_alias__));
1604  int __u = ((const struct __loadu_si32 *)__a)->__v;
1605  return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1606 }
1607 
1608 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1609 /// vector and clears the upper elements.
1610 ///
1611 /// \headerfile <x86intrin.h>
1612 ///
1613 /// This intrinsic does not correspond to a specific instruction.
1614 ///
1615 /// \param __a
1616 /// A pointer to a 16-bit memory location. The address of the memory
1617 /// location does not have to be aligned.
1618 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1619 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1620  struct __loadu_si16 {
1621  short __v;
1622  } __attribute__((__packed__, __may_alias__));
1623  short __u = ((const struct __loadu_si16 *)__a)->__v;
1624  return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1625 }
1626 
1627 /// Loads a 64-bit double-precision value to the low element of a
1628 /// 128-bit vector of [2 x double] and clears the upper element.
1629 ///
1630 /// \headerfile <x86intrin.h>
1631 ///
1632 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1633 ///
1634 /// \param __dp
1635 /// A pointer to a memory location containing a double-precision value.
1636 /// The address of the memory location does not have to be aligned.
1637 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1638 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1639  struct __mm_load_sd_struct {
1640  double __u;
1641  } __attribute__((__packed__, __may_alias__));
1642  double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1643  return __extension__(__m128d){__u, 0};
1644 }
1645 
1646 /// Loads a double-precision value into the high-order bits of a 128-bit
1647 /// vector of [2 x double]. The low-order bits are copied from the low-order
1648 /// bits of the first operand.
1649 ///
1650 /// \headerfile <x86intrin.h>
1651 ///
1652 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1653 ///
1654 /// \param __a
1655 /// A 128-bit vector of [2 x double]. \n
1656 /// Bits [63:0] are written to bits [63:0] of the result.
1657 /// \param __dp
1658 /// A pointer to a 64-bit memory location containing a double-precision
1659 /// floating-point value that is loaded. The loaded value is written to bits
1660 /// [127:64] of the result. The address of the memory location does not have
1661 /// to be aligned.
1662 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1663 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1664  double const *__dp) {
1665  struct __mm_loadh_pd_struct {
1666  double __u;
1667  } __attribute__((__packed__, __may_alias__));
1668  double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1669  return __extension__(__m128d){__a[0], __u};
1670 }
1671 
1672 /// Loads a double-precision value into the low-order bits of a 128-bit
1673 /// vector of [2 x double]. The high-order bits are copied from the
1674 /// high-order bits of the first operand.
1675 ///
1676 /// \headerfile <x86intrin.h>
1677 ///
1678 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1679 ///
1680 /// \param __a
1681 /// A 128-bit vector of [2 x double]. \n
1682 /// Bits [127:64] are written to bits [127:64] of the result.
1683 /// \param __dp
1684 /// A pointer to a 64-bit memory location containing a double-precision
1685 /// floating-point value that is loaded. The loaded value is written to bits
1686 /// [63:0] of the result. The address of the memory location does not have to
1687 /// be aligned.
1688 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1689 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1690  double const *__dp) {
1691  struct __mm_loadl_pd_struct {
1692  double __u;
1693  } __attribute__((__packed__, __may_alias__));
1694  double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1695  return __extension__(__m128d){__u, __a[1]};
1696 }
1697 
1698 /// Constructs a 128-bit floating-point vector of [2 x double] with
1699 /// unspecified content. This could be used as an argument to another
1700 /// intrinsic function where the argument is required but the value is not
1701 /// actually used.
1702 ///
1703 /// \headerfile <x86intrin.h>
1704 ///
1705 /// This intrinsic has no corresponding instruction.
1706 ///
1707 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1708 /// content.
1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1710  return (__m128d)__builtin_ia32_undef128();
1711 }
1712 
1713 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1714 /// 64 bits of the vector are initialized with the specified double-precision
1715 /// floating-point value. The upper 64 bits are set to zero.
1716 ///
1717 /// \headerfile <x86intrin.h>
1718 ///
1719 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1720 ///
1721 /// \param __w
1722 /// A double-precision floating-point value used to initialize the lower 64
1723 /// bits of the result.
1724 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1725 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1726 /// set to zero.
1727 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1728  return __extension__(__m128d){__w, 0};
1729 }
1730 
1731 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1732 /// of the two double-precision floating-point vector elements set to the
1733 /// specified double-precision floating-point value.
1734 ///
1735 /// \headerfile <x86intrin.h>
1736 ///
1737 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1738 ///
1739 /// \param __w
1740 /// A double-precision floating-point value used to initialize each vector
1741 /// element of the result.
1742 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1744  return __extension__(__m128d){__w, __w};
1745 }
1746 
1747 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1748 /// of the two double-precision floating-point vector elements set to the
1749 /// specified double-precision floating-point value.
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1754 ///
1755 /// \param __w
1756 /// A double-precision floating-point value used to initialize each vector
1757 /// element of the result.
1758 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1759 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1760  return _mm_set1_pd(__w);
1761 }
1762 
1763 /// Constructs a 128-bit floating-point vector of [2 x double]
1764 /// initialized with the specified double-precision floating-point values.
1765 ///
1766 /// \headerfile <x86intrin.h>
1767 ///
1768 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1769 ///
1770 /// \param __w
1771 /// A double-precision floating-point value used to initialize the upper 64
1772 /// bits of the result.
1773 /// \param __x
1774 /// A double-precision floating-point value used to initialize the lower 64
1775 /// bits of the result.
1776 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1777 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1778  double __x) {
1779  return __extension__(__m128d){__x, __w};
1780 }
1781 
1782 /// Constructs a 128-bit floating-point vector of [2 x double],
1783 /// initialized in reverse order with the specified double-precision
1784 /// floating-point values.
1785 ///
1786 /// \headerfile <x86intrin.h>
1787 ///
1788 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1789 ///
1790 /// \param __w
1791 /// A double-precision floating-point value used to initialize the lower 64
1792 /// bits of the result.
1793 /// \param __x
1794 /// A double-precision floating-point value used to initialize the upper 64
1795 /// bits of the result.
1796 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1797 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1798  double __x) {
1799  return __extension__(__m128d){__w, __x};
1800 }
1801 
1802 /// Constructs a 128-bit floating-point vector of [2 x double]
1803 /// initialized to zero.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1808 ///
1809 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1810 /// all elements set to zero.
1811 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1812  return __extension__(__m128d){0, 0};
1813 }
1814 
1815 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1816 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1817 /// 64 bits are set to the upper 64 bits of the first parameter.
1818 ///
1819 /// \headerfile <x86intrin.h>
1820 ///
1821 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1822 ///
1823 /// \param __a
1824 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1825 /// upper 64 bits of the result.
1826 /// \param __b
1827 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1828 /// lower 64 bits of the result.
1829 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1830 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1831  __m128d __b) {
1832  __a[0] = __b[0];
1833  return __a;
1834 }
1835 
1836 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1837 /// memory location.
1838 ///
1839 /// \headerfile <x86intrin.h>
1840 ///
1841 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1842 ///
1843 /// \param __dp
1844 /// A pointer to a 64-bit memory location.
1845 /// \param __a
1846 /// A 128-bit vector of [2 x double] containing the value to be stored.
1847 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1848  __m128d __a) {
1849  struct __mm_store_sd_struct {
1850  double __u;
1851  } __attribute__((__packed__, __may_alias__));
1852  ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1853 }
1854 
1855 /// Moves packed double-precision values from a 128-bit vector of
1856 /// [2 x double] to a memory location.
1857 ///
1858 /// \headerfile <x86intrin.h>
1859 ///
1860 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1861 ///
1862 /// \param __dp
1863 /// A pointer to an aligned memory location that can store two
1864 /// double-precision values.
1865 /// \param __a
1866 /// A packed 128-bit vector of [2 x double] containing the values to be
1867 /// moved.
1868 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1869  __m128d __a) {
1870  *(__m128d *)__dp = __a;
1871 }
1872 
1873 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1874 /// the upper and lower 64 bits of a memory location.
1875 ///
1876 /// \headerfile <x86intrin.h>
1877 ///
1878 /// This intrinsic corresponds to the
1879 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1880 ///
1881 /// \param __dp
1882 /// A pointer to a 16-byte aligned memory location that can store two
1883 /// double-precision values.
1884 /// \param __a
1885 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1886 /// of the values in \a __dp.
1887 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1888  __m128d __a) {
1889  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1890  _mm_store_pd(__dp, __a);
1891 }
1892 
1893 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1894 /// the upper and lower 64 bits of a memory location.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the
1899 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1900 ///
1901 /// \param __dp
1902 /// A pointer to a 16-byte aligned memory location that can store two
1903 /// double-precision values.
1904 /// \param __a
1905 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1906 /// of the values in \a __dp.
1907 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1908  __m128d __a) {
1909  _mm_store1_pd(__dp, __a);
1910 }
1911 
1912 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1913 /// location.
1914 ///
1915 /// \headerfile <x86intrin.h>
1916 ///
1917 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1918 ///
1919 /// \param __dp
1920 /// A pointer to a 128-bit memory location. The address of the memory
1921 /// location does not have to be aligned.
1922 /// \param __a
1923 /// A 128-bit vector of [2 x double] containing the values to be stored.
1924 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1925  __m128d __a) {
1926  struct __storeu_pd {
1927  __m128d_u __v;
1928  } __attribute__((__packed__, __may_alias__));
1929  ((struct __storeu_pd *)__dp)->__v = __a;
1930 }
1931 
1932 /// Stores two double-precision values, in reverse order, from a 128-bit
1933 /// vector of [2 x double] to a 16-byte aligned memory location.
1934 ///
1935 /// \headerfile <x86intrin.h>
1936 ///
1937 /// This intrinsic corresponds to a shuffling instruction followed by a
1938 /// <c> VMOVAPD / MOVAPD </c> instruction.
1939 ///
1940 /// \param __dp
1941 /// A pointer to a 16-byte aligned memory location that can store two
1942 /// double-precision values.
1943 /// \param __a
1944 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1945 /// stored.
1946 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1947  __m128d __a) {
1948  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1949  *(__m128d *)__dp = __a;
1950 }
1951 
1952 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1953 /// memory location.
1954 ///
1955 /// \headerfile <x86intrin.h>
1956 ///
1957 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1958 ///
1959 /// \param __dp
1960 /// A pointer to a 64-bit memory location.
1961 /// \param __a
1962 /// A 128-bit vector of [2 x double] containing the value to be stored.
1963 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1964  __m128d __a) {
1965  struct __mm_storeh_pd_struct {
1966  double __u;
1967  } __attribute__((__packed__, __may_alias__));
1968  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1969 }
1970 
1971 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1972 /// memory location.
1973 ///
1974 /// \headerfile <x86intrin.h>
1975 ///
1976 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1977 ///
1978 /// \param __dp
1979 /// A pointer to a 64-bit memory location.
1980 /// \param __a
1981 /// A 128-bit vector of [2 x double] containing the value to be stored.
1982 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1983  __m128d __a) {
1984  struct __mm_storeh_pd_struct {
1985  double __u;
1986  } __attribute__((__packed__, __may_alias__));
1987  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1988 }
1989 
1990 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1991 /// saving the lower 8 bits of each sum in the corresponding element of a
1992 /// 128-bit result vector of [16 x i8].
1993 ///
1994 /// The integer elements of both parameters can be either signed or unsigned.
1995 ///
1996 /// \headerfile <x86intrin.h>
1997 ///
1998 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
1999 ///
2000 /// \param __a
2001 /// A 128-bit vector of [16 x i8].
2002 /// \param __b
2003 /// A 128-bit vector of [16 x i8].
2004 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2005 /// parameters.
2006 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2007  __m128i __b) {
2008  return (__m128i)((__v16qu)__a + (__v16qu)__b);
2009 }
2010 
2011 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2012 /// saving the lower 16 bits of each sum in the corresponding element of a
2013 /// 128-bit result vector of [8 x i16].
2014 ///
2015 /// The integer elements of both parameters can be either signed or unsigned.
2016 ///
2017 /// \headerfile <x86intrin.h>
2018 ///
2019 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2020 ///
2021 /// \param __a
2022 /// A 128-bit vector of [8 x i16].
2023 /// \param __b
2024 /// A 128-bit vector of [8 x i16].
2025 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2026 /// parameters.
2027 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2028  __m128i __b) {
2029  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2030 }
2031 
2032 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2033 /// saving the lower 32 bits of each sum in the corresponding element of a
2034 /// 128-bit result vector of [4 x i32].
2035 ///
2036 /// The integer elements of both parameters can be either signed or unsigned.
2037 ///
2038 /// \headerfile <x86intrin.h>
2039 ///
2040 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2041 ///
2042 /// \param __a
2043 /// A 128-bit vector of [4 x i32].
2044 /// \param __b
2045 /// A 128-bit vector of [4 x i32].
2046 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2047 /// parameters.
2048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2049  __m128i __b) {
2050  return (__m128i)((__v4su)__a + (__v4su)__b);
2051 }
2052 
2053 /// Adds two signed or unsigned 64-bit integer values, returning the
2054 /// lower 64 bits of the sum.
2055 ///
2056 /// \headerfile <x86intrin.h>
2057 ///
2058 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2059 ///
2060 /// \param __a
2061 /// A 64-bit integer.
2062 /// \param __b
2063 /// A 64-bit integer.
2064 /// \returns A 64-bit integer containing the sum of both parameters.
2065 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2066  __m64 __b) {
2067  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2068 }
2069 
2070 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2071 /// saving the lower 64 bits of each sum in the corresponding element of a
2072 /// 128-bit result vector of [2 x i64].
2073 ///
2074 /// The integer elements of both parameters can be either signed or unsigned.
2075 ///
2076 /// \headerfile <x86intrin.h>
2077 ///
2078 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2079 ///
2080 /// \param __a
2081 /// A 128-bit vector of [2 x i64].
2082 /// \param __b
2083 /// A 128-bit vector of [2 x i64].
2084 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2085 /// parameters.
2086 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2087  __m128i __b) {
2088  return (__m128i)((__v2du)__a + (__v2du)__b);
2089 }
2090 
2091 /// Adds, with saturation, the corresponding elements of two 128-bit
2092 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2093 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2094 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2095 ///
2096 /// \headerfile <x86intrin.h>
2097 ///
2098 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2099 ///
2100 /// \param __a
2101 /// A 128-bit signed [16 x i8] vector.
2102 /// \param __b
2103 /// A 128-bit signed [16 x i8] vector.
2104 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2105 /// both parameters.
2106 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2107  __m128i __b) {
2108  return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2109 }
2110 
2111 /// Adds, with saturation, the corresponding elements of two 128-bit
2112 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2113 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2114 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2115 /// 0x8000.
2116 ///
2117 /// \headerfile <x86intrin.h>
2118 ///
2119 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2120 ///
2121 /// \param __a
2122 /// A 128-bit signed [8 x i16] vector.
2123 /// \param __b
2124 /// A 128-bit signed [8 x i16] vector.
2125 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2126 /// both parameters.
2127 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2128  __m128i __b) {
2129  return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2130 }
2131 
2132 /// Adds, with saturation, the corresponding elements of two 128-bit
2133 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2134 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2135 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2136 ///
2137 /// \headerfile <x86intrin.h>
2138 ///
2139 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2140 ///
2141 /// \param __a
2142 /// A 128-bit unsigned [16 x i8] vector.
2143 /// \param __b
2144 /// A 128-bit unsigned [16 x i8] vector.
2145 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2146 /// of both parameters.
2147 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2148  __m128i __b) {
2149  return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2150 }
2151 
2152 /// Adds, with saturation, the corresponding elements of two 128-bit
2153 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2154 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2155 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2156 ///
2157 /// \headerfile <x86intrin.h>
2158 ///
2159 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2160 ///
2161 /// \param __a
2162 /// A 128-bit unsigned [8 x i16] vector.
2163 /// \param __b
2164 /// A 128-bit unsigned [8 x i16] vector.
2165 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2166 /// of both parameters.
2167 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2168  __m128i __b) {
2169  return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2170 }
2171 
2172 /// Computes the rounded averages of corresponding elements of two
2173 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2174 /// corresponding element of a 128-bit result vector of [16 x i8].
2175 ///
2176 /// \headerfile <x86intrin.h>
2177 ///
2178 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2179 ///
2180 /// \param __a
2181 /// A 128-bit unsigned [16 x i8] vector.
2182 /// \param __b
2183 /// A 128-bit unsigned [16 x i8] vector.
2184 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2185 /// averages of both parameters.
2186 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2187  __m128i __b) {
2188  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2189 }
2190 
2191 /// Computes the rounded averages of corresponding elements of two
2192 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2193 /// corresponding element of a 128-bit result vector of [8 x i16].
2194 ///
2195 /// \headerfile <x86intrin.h>
2196 ///
2197 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2198 ///
2199 /// \param __a
2200 /// A 128-bit unsigned [8 x i16] vector.
2201 /// \param __b
2202 /// A 128-bit unsigned [8 x i16] vector.
2203 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2204 /// averages of both parameters.
2205 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2206  __m128i __b) {
2207  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2208 }
2209 
2210 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2211 /// vectors, producing eight intermediate 32-bit signed integer products, and
2212 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2213 /// [4 x i32] vector.
2214 ///
2215 /// For example, bits [15:0] of both parameters are multiplied producing a
2216 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2217 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2218 /// of the result.
2219 ///
2220 /// \headerfile <x86intrin.h>
2221 ///
2222 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2223 ///
2224 /// \param __a
2225 /// A 128-bit signed [8 x i16] vector.
2226 /// \param __b
2227 /// A 128-bit signed [8 x i16] vector.
2228 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2229 /// of both parameters.
2230 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2231  __m128i __b) {
2232  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2233 }
2234 
2235 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2236 /// vectors, saving the greater value from each comparison in the
2237 /// corresponding element of a 128-bit result vector of [8 x i16].
2238 ///
2239 /// \headerfile <x86intrin.h>
2240 ///
2241 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2242 ///
2243 /// \param __a
2244 /// A 128-bit signed [8 x i16] vector.
2245 /// \param __b
2246 /// A 128-bit signed [8 x i16] vector.
2247 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2248 /// each comparison.
2249 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2250  __m128i __b) {
2251  return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2252 }
2253 
2254 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2255 /// vectors, saving the greater value from each comparison in the
2256 /// corresponding element of a 128-bit result vector of [16 x i8].
2257 ///
2258 /// \headerfile <x86intrin.h>
2259 ///
2260 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2261 ///
2262 /// \param __a
2263 /// A 128-bit unsigned [16 x i8] vector.
2264 /// \param __b
2265 /// A 128-bit unsigned [16 x i8] vector.
2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2267 /// each comparison.
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2269  __m128i __b) {
2270  return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2271 }
2272 
2273 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2274 /// vectors, saving the smaller value from each comparison in the
2275 /// corresponding element of a 128-bit result vector of [8 x i16].
2276 ///
2277 /// \headerfile <x86intrin.h>
2278 ///
2279 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2280 ///
2281 /// \param __a
2282 /// A 128-bit signed [8 x i16] vector.
2283 /// \param __b
2284 /// A 128-bit signed [8 x i16] vector.
2285 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2286 /// each comparison.
2287 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2288  __m128i __b) {
2289  return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2290 }
2291 
2292 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2293 /// vectors, saving the smaller value from each comparison in the
2294 /// corresponding element of a 128-bit result vector of [16 x i8].
2295 ///
2296 /// \headerfile <x86intrin.h>
2297 ///
2298 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2299 ///
2300 /// \param __a
2301 /// A 128-bit unsigned [16 x i8] vector.
2302 /// \param __b
2303 /// A 128-bit unsigned [16 x i8] vector.
2304 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2305 /// each comparison.
2306 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2307  __m128i __b) {
2308  return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2309 }
2310 
2311 /// Multiplies the corresponding elements of two signed [8 x i16]
2312 /// vectors, saving the upper 16 bits of each 32-bit product in the
2313 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2314 ///
2315 /// \headerfile <x86intrin.h>
2316 ///
2317 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2318 ///
2319 /// \param __a
2320 /// A 128-bit signed [8 x i16] vector.
2321 /// \param __b
2322 /// A 128-bit signed [8 x i16] vector.
2323 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2324 /// each of the eight 32-bit products.
2325 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2326  __m128i __b) {
2327  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2328 }
2329 
2330 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2331 /// vectors, saving the upper 16 bits of each 32-bit product in the
2332 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2333 ///
2334 /// \headerfile <x86intrin.h>
2335 ///
2336 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2337 ///
2338 /// \param __a
2339 /// A 128-bit unsigned [8 x i16] vector.
2340 /// \param __b
2341 /// A 128-bit unsigned [8 x i16] vector.
2342 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2343 /// of each of the eight 32-bit products.
2344 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2345  __m128i __b) {
2346  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2347 }
2348 
2349 /// Multiplies the corresponding elements of two signed [8 x i16]
2350 /// vectors, saving the lower 16 bits of each 32-bit product in the
2351 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2352 ///
2353 /// \headerfile <x86intrin.h>
2354 ///
2355 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2356 ///
2357 /// \param __a
2358 /// A 128-bit signed [8 x i16] vector.
2359 /// \param __b
2360 /// A 128-bit signed [8 x i16] vector.
2361 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2362 /// each of the eight 32-bit products.
2363 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2364  __m128i __b) {
2365  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2366 }
2367 
2368 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2369 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2370 /// product.
2371 ///
2372 /// \headerfile <x86intrin.h>
2373 ///
2374 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2375 ///
2376 /// \param __a
2377 /// A 64-bit integer containing one of the source operands.
2378 /// \param __b
2379 /// A 64-bit integer containing one of the source operands.
2380 /// \returns A 64-bit integer vector containing the product of both operands.
2381 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2382  __m64 __b) {
2383  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2384 }
2385 
2386 /// Multiplies 32-bit unsigned integer values contained in the lower
2387 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2388 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2389 ///
2390 /// \headerfile <x86intrin.h>
2391 ///
2392 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2393 ///
2394 /// \param __a
2395 /// A [2 x i64] vector containing one of the source operands.
2396 /// \param __b
2397 /// A [2 x i64] vector containing one of the source operands.
2398 /// \returns A [2 x i64] vector containing the product of both operands.
2399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2400  __m128i __b) {
2401  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2402 }
2403 
2404 /// Computes the absolute differences of corresponding 8-bit integer
2405 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2406 /// separately sums the second 8 absolute differences. Packs these two
2407 /// unsigned 16-bit integer sums into the upper and lower elements of a
2408 /// [2 x i64] vector.
2409 ///
2410 /// \headerfile <x86intrin.h>
2411 ///
2412 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2413 ///
2414 /// \param __a
2415 /// A 128-bit integer vector containing one of the source operands.
2416 /// \param __b
2417 /// A 128-bit integer vector containing one of the source operands.
2418 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2419 /// differences between both operands.
2420 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2421  __m128i __b) {
2422  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2423 }
2424 
2425 /// Subtracts the corresponding 8-bit integer values in the operands.
2426 ///
2427 /// \headerfile <x86intrin.h>
2428 ///
2429 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2430 ///
2431 /// \param __a
2432 /// A 128-bit integer vector containing the minuends.
2433 /// \param __b
2434 /// A 128-bit integer vector containing the subtrahends.
2435 /// \returns A 128-bit integer vector containing the differences of the values
2436 /// in the operands.
2437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2438  __m128i __b) {
2439  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2440 }
2441 
2442 /// Subtracts the corresponding 16-bit integer values in the operands.
2443 ///
2444 /// \headerfile <x86intrin.h>
2445 ///
2446 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2447 ///
2448 /// \param __a
2449 /// A 128-bit integer vector containing the minuends.
2450 /// \param __b
2451 /// A 128-bit integer vector containing the subtrahends.
2452 /// \returns A 128-bit integer vector containing the differences of the values
2453 /// in the operands.
2454 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2455  __m128i __b) {
2456  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2457 }
2458 
2459 /// Subtracts the corresponding 32-bit integer values in the operands.
2460 ///
2461 /// \headerfile <x86intrin.h>
2462 ///
2463 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2464 ///
2465 /// \param __a
2466 /// A 128-bit integer vector containing the minuends.
2467 /// \param __b
2468 /// A 128-bit integer vector containing the subtrahends.
2469 /// \returns A 128-bit integer vector containing the differences of the values
2470 /// in the operands.
2471 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2472  __m128i __b) {
2473  return (__m128i)((__v4su)__a - (__v4su)__b);
2474 }
2475 
2476 /// Subtracts signed or unsigned 64-bit integer values and writes the
2477 /// difference to the corresponding bits in the destination.
2478 ///
2479 /// \headerfile <x86intrin.h>
2480 ///
2481 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2482 ///
2483 /// \param __a
2484 /// A 64-bit integer vector containing the minuend.
2485 /// \param __b
2486 /// A 64-bit integer vector containing the subtrahend.
2487 /// \returns A 64-bit integer vector containing the difference of the values in
2488 /// the operands.
2489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2490  __m64 __b) {
2491  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2492 }
2493 
2494 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2495 ///
2496 /// \headerfile <x86intrin.h>
2497 ///
2498 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2499 ///
2500 /// \param __a
2501 /// A 128-bit integer vector containing the minuends.
2502 /// \param __b
2503 /// A 128-bit integer vector containing the subtrahends.
2504 /// \returns A 128-bit integer vector containing the differences of the values
2505 /// in the operands.
2506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2507  __m128i __b) {
2508  return (__m128i)((__v2du)__a - (__v2du)__b);
2509 }
2510 
2511 /// Subtracts corresponding 8-bit signed integer values in the input and
2512 /// returns the differences in the corresponding bytes in the destination.
2513 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2514 /// than 0x80 are saturated to 0x80.
2515 ///
2516 /// \headerfile <x86intrin.h>
2517 ///
2518 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2519 ///
2520 /// \param __a
2521 /// A 128-bit integer vector containing the minuends.
2522 /// \param __b
2523 /// A 128-bit integer vector containing the subtrahends.
2524 /// \returns A 128-bit integer vector containing the differences of the values
2525 /// in the operands.
2526 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2527  __m128i __b) {
2528  return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2529 }
2530 
2531 /// Subtracts corresponding 16-bit signed integer values in the input and
2532 /// returns the differences in the corresponding bytes in the destination.
2533 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2534 /// than 0x8000 are saturated to 0x8000.
2535 ///
2536 /// \headerfile <x86intrin.h>
2537 ///
2538 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2539 ///
2540 /// \param __a
2541 /// A 128-bit integer vector containing the minuends.
2542 /// \param __b
2543 /// A 128-bit integer vector containing the subtrahends.
2544 /// \returns A 128-bit integer vector containing the differences of the values
2545 /// in the operands.
2546 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2547  __m128i __b) {
2548  return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2549 }
2550 
2551 /// Subtracts corresponding 8-bit unsigned integer values in the input
2552 /// and returns the differences in the corresponding bytes in the
2553 /// destination. Differences less than 0x00 are saturated to 0x00.
2554 ///
2555 /// \headerfile <x86intrin.h>
2556 ///
2557 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2558 ///
2559 /// \param __a
2560 /// A 128-bit integer vector containing the minuends.
2561 /// \param __b
2562 /// A 128-bit integer vector containing the subtrahends.
2563 /// \returns A 128-bit integer vector containing the unsigned integer
2564 /// differences of the values in the operands.
2565 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2566  __m128i __b) {
2567  return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2568 }
2569 
2570 /// Subtracts corresponding 16-bit unsigned integer values in the input
2571 /// and returns the differences in the corresponding bytes in the
2572 /// destination. Differences less than 0x0000 are saturated to 0x0000.
2573 ///
2574 /// \headerfile <x86intrin.h>
2575 ///
2576 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2577 ///
2578 /// \param __a
2579 /// A 128-bit integer vector containing the minuends.
2580 /// \param __b
2581 /// A 128-bit integer vector containing the subtrahends.
2582 /// \returns A 128-bit integer vector containing the unsigned integer
2583 /// differences of the values in the operands.
2584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2585  __m128i __b) {
2586  return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2587 }
2588 
2589 /// Performs a bitwise AND of two 128-bit integer vectors.
2590 ///
2591 /// \headerfile <x86intrin.h>
2592 ///
2593 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2594 ///
2595 /// \param __a
2596 /// A 128-bit integer vector containing one of the source operands.
2597 /// \param __b
2598 /// A 128-bit integer vector containing one of the source operands.
2599 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2600 /// in both operands.
2601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2602  __m128i __b) {
2603  return (__m128i)((__v2du)__a & (__v2du)__b);
2604 }
2605 
2606 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2607 /// one's complement of the values contained in the first source operand.
2608 ///
2609 /// \headerfile <x86intrin.h>
2610 ///
2611 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2612 ///
2613 /// \param __a
2614 /// A 128-bit vector containing the left source operand. The one's complement
2615 /// of this value is used in the bitwise AND.
2616 /// \param __b
2617 /// A 128-bit vector containing the right source operand.
2618 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2619 /// complement of the first operand and the values in the second operand.
2620 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2621  __m128i __b) {
2622  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2623 }
2624 /// Performs a bitwise OR of two 128-bit integer vectors.
2625 ///
2626 /// \headerfile <x86intrin.h>
2627 ///
2628 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2629 ///
2630 /// \param __a
2631 /// A 128-bit integer vector containing one of the source operands.
2632 /// \param __b
2633 /// A 128-bit integer vector containing one of the source operands.
2634 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2635 /// in both operands.
2636 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2637  __m128i __b) {
2638  return (__m128i)((__v2du)__a | (__v2du)__b);
2639 }
2640 
2641 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2642 ///
2643 /// \headerfile <x86intrin.h>
2644 ///
2645 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2646 ///
2647 /// \param __a
2648 /// A 128-bit integer vector containing one of the source operands.
2649 /// \param __b
2650 /// A 128-bit integer vector containing one of the source operands.
2651 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2652 /// values in both operands.
2653 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2654  __m128i __b) {
2655  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2656 }
2657 
/// Shifts the whole 128-bit integer vector operand to the left by the
///    specified number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes by which operand
///    \a a is shifted left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2678 
/// Shifts the whole 128-bit integer vector operand to the left by the
///    specified number of bytes. This macro has the same expansion as
///    _mm_slli_si128.
///
/// \code
/// __m128i _mm_bslli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes by which operand
///    \a a is shifted left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2682 
2683 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2684 /// by the specified number of bits. Low-order bits are cleared.
2685 ///
2686 /// \headerfile <x86intrin.h>
2687 ///
2688 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2689 ///
2690 /// \param __a
2691 /// A 128-bit integer vector containing the source operand.
2692 /// \param __count
2693 /// An integer value specifying the number of bits to left-shift each value
2694 /// in operand \a __a.
2695 /// \returns A 128-bit integer vector containing the left-shifted values.
2696 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2697  int __count) {
2698  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2699 }
2700 
2701 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2702 /// by the specified number of bits. Low-order bits are cleared.
2703 ///
2704 /// \headerfile <x86intrin.h>
2705 ///
2706 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2707 ///
2708 /// \param __a
2709 /// A 128-bit integer vector containing the source operand.
2710 /// \param __count
2711 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2712 /// to left-shift each value in operand \a __a.
2713 /// \returns A 128-bit integer vector containing the left-shifted values.
2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2715  __m128i __count) {
2716  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2717 }
2718 
2719 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2720 /// by the specified number of bits. Low-order bits are cleared.
2721 ///
2722 /// \headerfile <x86intrin.h>
2723 ///
2724 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2725 ///
2726 /// \param __a
2727 /// A 128-bit integer vector containing the source operand.
2728 /// \param __count
2729 /// An integer value specifying the number of bits to left-shift each value
2730 /// in operand \a __a.
2731 /// \returns A 128-bit integer vector containing the left-shifted values.
2732 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2733  int __count) {
2734  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2735 }
2736 
2737 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2738 /// by the specified number of bits. Low-order bits are cleared.
2739 ///
2740 /// \headerfile <x86intrin.h>
2741 ///
2742 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2743 ///
2744 /// \param __a
2745 /// A 128-bit integer vector containing the source operand.
2746 /// \param __count
2747 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2748 /// to left-shift each value in operand \a __a.
2749 /// \returns A 128-bit integer vector containing the left-shifted values.
2750 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2751  __m128i __count) {
2752  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2753 }
2754 
2755 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2756 /// by the specified number of bits. Low-order bits are cleared.
2757 ///
2758 /// \headerfile <x86intrin.h>
2759 ///
2760 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2761 ///
2762 /// \param __a
2763 /// A 128-bit integer vector containing the source operand.
2764 /// \param __count
2765 /// An integer value specifying the number of bits to left-shift each value
2766 /// in operand \a __a.
2767 /// \returns A 128-bit integer vector containing the left-shifted values.
2768 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2769  int __count) {
2770  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2771 }
2772 
2773 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2774 /// by the specified number of bits. Low-order bits are cleared.
2775 ///
2776 /// \headerfile <x86intrin.h>
2777 ///
2778 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2779 ///
2780 /// \param __a
2781 /// A 128-bit integer vector containing the source operand.
2782 /// \param __count
2783 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2784 /// to left-shift each value in operand \a __a.
2785 /// \returns A 128-bit integer vector containing the left-shifted values.
2786 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2787  __m128i __count) {
2788  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2789 }
2790 
2791 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2792 /// by the specified number of bits. High-order bits are filled with the sign
2793 /// bit of the initial value.
2794 ///
2795 /// \headerfile <x86intrin.h>
2796 ///
2797 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2798 ///
2799 /// \param __a
2800 /// A 128-bit integer vector containing the source operand.
2801 /// \param __count
2802 /// An integer value specifying the number of bits to right-shift each value
2803 /// in operand \a __a.
2804 /// \returns A 128-bit integer vector containing the right-shifted values.
2805 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2806  int __count) {
2807  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2808 }
2809 
2810 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2811 /// by the specified number of bits. High-order bits are filled with the sign
2812 /// bit of the initial value.
2813 ///
2814 /// \headerfile <x86intrin.h>
2815 ///
2816 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2817 ///
2818 /// \param __a
2819 /// A 128-bit integer vector containing the source operand.
2820 /// \param __count
2821 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2822 /// to right-shift each value in operand \a __a.
2823 /// \returns A 128-bit integer vector containing the right-shifted values.
2824 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2825  __m128i __count) {
2826  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2827 }
2828 
2829 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2830 /// by the specified number of bits. High-order bits are filled with the sign
2831 /// bit of the initial value.
2832 ///
2833 /// \headerfile <x86intrin.h>
2834 ///
2835 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2836 ///
2837 /// \param __a
2838 /// A 128-bit integer vector containing the source operand.
2839 /// \param __count
2840 /// An integer value specifying the number of bits to right-shift each value
2841 /// in operand \a __a.
2842 /// \returns A 128-bit integer vector containing the right-shifted values.
2843 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2844  int __count) {
2845  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2846 }
2847 
2848 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2849 /// by the specified number of bits. High-order bits are filled with the sign
2850 /// bit of the initial value.
2851 ///
2852 /// \headerfile <x86intrin.h>
2853 ///
2854 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2855 ///
2856 /// \param __a
2857 /// A 128-bit integer vector containing the source operand.
2858 /// \param __count
2859 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2860 /// to right-shift each value in operand \a __a.
2861 /// \returns A 128-bit integer vector containing the right-shifted values.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2863  __m128i __count) {
2864  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2865 }
2866 
/// Shifts the whole 128-bit integer vector operand to the right by the
///    specified number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes by which operand
///    \a a is shifted right.
/// \returns A 128-bit integer vector holding the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2887 
/// Shifts the whole 128-bit integer vector operand to the right by the
///    specified number of bytes. This macro has the same expansion as
///    _mm_srli_si128.
///
/// \code
/// __m128i _mm_bsrli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes by which operand
///    \a a is shifted right.
/// \returns A 128-bit integer vector holding the right-shifted value.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2891 
2892 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2893 /// operand by the specified number of bits. High-order bits are cleared.
2894 ///
2895 /// \headerfile <x86intrin.h>
2896 ///
2897 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2898 ///
2899 /// \param __a
2900 /// A 128-bit integer vector containing the source operand.
2901 /// \param __count
2902 /// An integer value specifying the number of bits to right-shift each value
2903 /// in operand \a __a.
2904 /// \returns A 128-bit integer vector containing the right-shifted values.
2905 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2906  int __count) {
2907  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2908 }
2909 
2910 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2911 /// operand by the specified number of bits. High-order bits are cleared.
2912 ///
2913 /// \headerfile <x86intrin.h>
2914 ///
2915 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2916 ///
2917 /// \param __a
2918 /// A 128-bit integer vector containing the source operand.
2919 /// \param __count
2920 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2921 /// to right-shift each value in operand \a __a.
2922 /// \returns A 128-bit integer vector containing the right-shifted values.
2923 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2924  __m128i __count) {
2925  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2926 }
2927 
2928 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2929 /// operand by the specified number of bits. High-order bits are cleared.
2930 ///
2931 /// \headerfile <x86intrin.h>
2932 ///
2933 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2934 ///
2935 /// \param __a
2936 /// A 128-bit integer vector containing the source operand.
2937 /// \param __count
2938 /// An integer value specifying the number of bits to right-shift each value
2939 /// in operand \a __a.
2940 /// \returns A 128-bit integer vector containing the right-shifted values.
2941 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2942  int __count) {
2943  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2944 }
2945 
2946 /// Right-shifts each of the 32-bit values in the 128-bit integer vector
2947 ///    operand by the specified number of bits. High-order bits are cleared.
2948 ///
2949 /// \headerfile <x86intrin.h>
2950 ///
2951 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2952 ///
2953 /// \param __a
2954 ///    A 128-bit integer vector containing the source operand, interpreted as [4 x i32].
2955 /// \param __count
2956 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2957 ///    to right-shift each value in operand \a __a.
2958 /// \returns A 128-bit integer vector containing the right-shifted values.
2959 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2960  __m128i __count) {
2961  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); /* the count vector is passed with the same [4 x i32] view as the data */
2962 }
2963 
2964 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
2965 ///    operand by the specified number of bits. High-order bits are cleared.
2966 ///
2967 /// \headerfile <x86intrin.h>
2968 ///
2969 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2970 ///
2971 /// \param __a
2972 ///    A 128-bit integer vector containing the source operand, interpreted as [2 x i64].
2973 /// \param __count
2974 ///    An integer value specifying the number of bits to right-shift each value
2975 ///    in operand \a __a. The same count is applied to both elements.
2976 /// \returns A 128-bit integer vector containing the right-shifted values.
2977 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2978  int __count) {
2979  return __builtin_ia32_psrlqi128((__v2di)__a, __count); /* unlike the 16/32-bit variants there is no result cast; __v2di and __m128i are both 16-byte vectors of long long */
2980 }
2981 
2982 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
2983 ///    operand by the specified number of bits. High-order bits are cleared.
2984 ///
2985 /// \headerfile <x86intrin.h>
2986 ///
2987 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2988 ///
2989 /// \param __a
2990 ///    A 128-bit integer vector containing the source operand, interpreted as [2 x i64].
2991 /// \param __count
2992 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2993 ///    to right-shift each value in operand \a __a.
2994 /// \returns A 128-bit integer vector containing the right-shifted values.
2995 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
2996  __m128i __count) {
2997  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); /* no result cast here; __v2di and __m128i are both 16-byte vectors of long long */
2998 }
2999 
3000 /// Compares each pair of corresponding 8-bit values of the two 128-bit
3001 ///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3002 ///    for true.
3003 ///
3004 /// \headerfile <x86intrin.h>
3005 ///
3006 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3007 ///
3008 /// \param __a
3009 ///    A 128-bit integer vector, compared as 16 separate 8-bit elements.
3010 /// \param __b
3011 ///    A 128-bit integer vector, compared as 16 separate 8-bit elements.
3012 /// \returns A 128-bit integer vector containing the comparison results.
3013 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3014  __m128i __b) {
3015  return (__m128i)((__v16qi)__a == (__v16qi)__b); /* element-wise vector ==; an equality test does not depend on whether char is signed */
3016 }
3017 
3018 /// Compares each pair of corresponding 16-bit values of the two 128-bit
3019 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3020 ///    0xFFFF for true.
3021 ///
3022 /// \headerfile <x86intrin.h>
3023 ///
3024 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3025 ///
3026 /// \param __a
3027 ///    A 128-bit integer vector, compared as 8 separate 16-bit elements.
3028 /// \param __b
3029 ///    A 128-bit integer vector, compared as 8 separate 16-bit elements.
3030 /// \returns A 128-bit integer vector containing the comparison results.
3031 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3032  __m128i __b) {
3033  return (__m128i)((__v8hi)__a == (__v8hi)__b); /* element-wise vector == on the [8 x i16] views */
3034 }
3035 
3036 /// Compares each pair of corresponding 32-bit values of the two 128-bit
3037 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3038 ///    0xFFFFFFFF for true.
3039 ///
3040 /// \headerfile <x86intrin.h>
3041 ///
3042 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3043 ///
3044 /// \param __a
3045 ///    A 128-bit integer vector, compared as 4 separate 32-bit elements.
3046 /// \param __b
3047 ///    A 128-bit integer vector, compared as 4 separate 32-bit elements.
3048 /// \returns A 128-bit integer vector containing the comparison results.
3049 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3050  __m128i __b) {
3051  return (__m128i)((__v4si)__a == (__v4si)__b); /* element-wise vector == on the [4 x i32] views */
3052 }
3053 
3054 /// Compares each pair of corresponding signed 8-bit values of the two 128-bit
3055 ///    integer vectors to determine if the values in the first operand are
3056 ///    greater than those in the second operand. Each comparison yields 0x0 for
3057 ///    false, 0xFF for true.
3058 ///
3059 /// \headerfile <x86intrin.h>
3060 ///
3061 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3062 ///
3063 /// \param __a
3064 ///    A 128-bit integer vector; the left-hand side of each signed comparison.
3065 /// \param __b
3066 ///    A 128-bit integer vector; the right-hand side of each signed comparison.
3067 /// \returns A 128-bit integer vector containing the comparison results.
3068 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3069  __m128i __b) {
3070  /* This function always performs a signed comparison, but __v16qi is a char
3071  which may be signed or unsigned, so use __v16qs. */
3072  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3073 }
3074 
3075 /// Compares each pair of corresponding signed 16-bit values of the two
3076 ///    128-bit integer vectors to determine if the values in the first operand
3077 ///    are greater than those in the second operand.
3078 ///
3079 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3080 ///
3081 /// \headerfile <x86intrin.h>
3082 ///
3083 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3084 ///
3085 /// \param __a
3086 ///    A 128-bit integer vector; the left-hand side of each signed comparison.
3087 /// \param __b
3088 ///    A 128-bit integer vector; the right-hand side of each signed comparison.
3089 /// \returns A 128-bit integer vector containing the comparison results.
3090 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3091  __m128i __b) {
3092  return (__m128i)((__v8hi)__a > (__v8hi)__b); /* __v8hi has short elements, so the comparison is signed */
3093 }
3094 
3095 /// Compares each pair of corresponding signed 32-bit values of the two
3096 ///    128-bit integer vectors to determine if the values in the first operand
3097 ///    are greater than those in the second operand.
3098 ///
3099 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3100 ///
3101 /// \headerfile <x86intrin.h>
3102 ///
3103 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3104 ///
3105 /// \param __a
3106 ///    A 128-bit integer vector; the left-hand side of each signed comparison.
3107 /// \param __b
3108 ///    A 128-bit integer vector; the right-hand side of each signed comparison.
3109 /// \returns A 128-bit integer vector containing the comparison results.
3110 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3111  __m128i __b) {
3112  return (__m128i)((__v4si)__a > (__v4si)__b); /* element-wise vector > on the [4 x i32] views */
3113 }
3114 
3115 /// Compares each pair of corresponding signed 8-bit values of the two 128-bit
3116 ///    integer vectors to determine if the values in the first operand are less
3117 ///    than those in the second operand.
3118 ///
3119 ///    Each comparison yields 0x0 for false, 0xFF for true.
3120 ///
3121 /// \headerfile <x86intrin.h>
3122 ///
3123 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction, because it is implemented as a greater-than comparison with the operands swapped.
3124 ///
3125 /// \param __a
3126 ///    A 128-bit integer vector; the left-hand side of each less-than comparison.
3127 /// \param __b
3128 ///    A 128-bit integer vector; the right-hand side of each less-than comparison.
3129 /// \returns A 128-bit integer vector containing the comparison results.
3130 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3131  __m128i __b) {
3132  return _mm_cmpgt_epi8(__b, __a); /* a < b is evaluated as b > a */
3133 }
3134 
3135 /// Compares each pair of corresponding signed 16-bit values of the two
3136 ///    128-bit integer vectors to determine if the values in the first operand
3137 ///    are less than those in the second operand.
3138 ///
3139 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3140 ///
3141 /// \headerfile <x86intrin.h>
3142 ///
3143 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction, because it is implemented as a greater-than comparison with the operands swapped.
3144 ///
3145 /// \param __a
3146 ///    A 128-bit integer vector; the left-hand side of each less-than comparison.
3147 /// \param __b
3148 ///    A 128-bit integer vector; the right-hand side of each less-than comparison.
3149 /// \returns A 128-bit integer vector containing the comparison results.
3150 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3151  __m128i __b) {
3152  return _mm_cmpgt_epi16(__b, __a); /* a < b is evaluated as b > a */
3153 }
3154 
3155 /// Compares each pair of corresponding signed 32-bit values of the two
3156 ///    128-bit integer vectors to determine if the values in the first operand
3157 ///    are less than those in the second operand.
3158 ///
3159 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3160 ///
3161 /// \headerfile <x86intrin.h>
3162 ///
3163 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction, because it is implemented as a greater-than comparison with the operands swapped.
3164 ///
3165 /// \param __a
3166 ///    A 128-bit integer vector; the left-hand side of each less-than comparison.
3167 /// \param __b
3168 ///    A 128-bit integer vector; the right-hand side of each less-than comparison.
3169 /// \returns A 128-bit integer vector containing the comparison results.
3170 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3171  __m128i __b) {
3172  return _mm_cmpgt_epi32(__b, __a); /* a < b is evaluated as b > a */
3173 }
3174 
3175 #ifdef __x86_64__
3176 /// Converts a 64-bit signed integer value from the second operand into a
3177 ///    double-precision value and returns it in the lower element of a [2 x
3178 ///    double] vector; the upper element of the returned vector is copied from
3179 ///    the upper element of the first operand. Only declared when __x86_64__ is defined.
3180 ///
3181 /// \headerfile <x86intrin.h>
3182 ///
3183 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3184 ///
3185 /// \param __a
3186 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3187 ///    copied to the upper 64 bits of the destination.
3188 /// \param __b
3189 ///    A 64-bit signed integer operand containing the value to be converted.
3190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3191 ///    converted value of the second operand. The upper 64 bits are copied from
3192 ///    the upper 64 bits of the first operand.
3193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3194  long long __b) {
3195  __a[0] = __b; /* implicit long long -> double conversion into element 0 of the by-value copy; element 1 is left as passed in */
3196  return __a;
3197 }
3198 
3199 /// Converts the first (lower) element of a vector of [2 x double] into a
3200 ///    64-bit signed integer value, according to the current rounding mode. Only declared when __x86_64__ is defined.
3201 ///
3202 /// \headerfile <x86intrin.h>
3203 ///
3204 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3205 ///
3206 /// \param __a
3207 ///    A 128-bit vector of [2 x double]. Only the lower 64 bits are used in the
3208 ///    conversion; the upper element is ignored.
3209 /// \returns A 64-bit signed integer containing the converted value.
3210 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3211  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3212 }
3213 
3214 /// Converts the first (lower) element of a vector of [2 x double] into a
3215 ///    64-bit signed integer value, truncating the result when it is inexact. Only declared when __x86_64__ is defined.
3216 ///
3217 /// \headerfile <x86intrin.h>
3218 ///
3219 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3220 ///    instruction.
3221 ///
3222 /// \param __a
3223 ///    A 128-bit vector of [2 x double]. Only the lower 64 bits are used in the
3224 ///    conversion; the upper element is ignored.
3225 /// \returns A 64-bit signed integer containing the converted value.
3226 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3227  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3228 }
3229 #endif
3230 
3231 /// Converts each element of a vector of [4 x i32] into the corresponding element of a vector of [4 x float].
3232 ///
3233 /// \headerfile <x86intrin.h>
3234 ///
3235 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3236 ///
3237 /// \param __a
3238 ///    A 128-bit integer vector, interpreted as [4 x i32].
3239 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3240 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3241  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf); /* element-wise value conversion, not a bit reinterpretation */
3242 }
3243 
3244 /// Converts each element of a vector of [4 x float] into the corresponding element of a vector of [4 x i32].
3245 ///
3246 /// \headerfile <x86intrin.h>
3247 ///
3248 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3249 ///
3250 /// \param __a
3251 ///    A 128-bit vector of [4 x float].
3252 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3253 ///    values.
3254 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3255  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3256 }
3257 
3258 /// Converts each element of a vector of [4 x float] into the corresponding element of a vector of [4 x i32],
3259 ///    truncating the result when it is inexact.
3260 ///
3261 /// \headerfile <x86intrin.h>
3262 ///
3263 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3264 ///    instruction.
3265 ///
3266 /// \param __a
3267 ///    A 128-bit vector of [4 x float].
3268 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3269 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3270  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3271 }
3272 
3273 /// Returns a vector of [4 x i32] where the lowest element is the input
3274 ///    operand and the remaining three elements are zero.
3275 ///
3276 /// \headerfile <x86intrin.h>
3277 ///
3278 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3279 ///
3280 /// \param __a
3281 ///    A 32-bit signed integer operand, placed in bits [31:0] of the result.
3282 /// \returns A 128-bit vector of [4 x i32].
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3284  return __extension__(__m128i)(__v4si){__a, 0, 0, 0}; /* initializer lists elements from lowest to highest */
3285 }
3286 
3287 /// Returns a vector of [2 x i64] where the lower element is the input
3288 ///    operand and the upper element is zero.
3289 ///
3290 /// \headerfile <x86intrin.h>
3291 ///
3292 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3293 ///    in 64-bit mode.
3294 ///
3295 /// \param __a
3296 ///    A 64-bit signed integer operand, placed in bits [63:0] of the result.
3297 /// \returns A 128-bit vector of [2 x i64] containing the operand in its lower element.
3298 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3299  return __extension__(__m128i)(__v2di){__a, 0}; /* initializer lists the lower element first */
3300 }
3301 
3302 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3303 ///    32-bit signed integer value.
3304 ///
3305 /// \headerfile <x86intrin.h>
3306 ///
3307 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3308 ///
3309 /// \param __a
3310 ///    A vector of [4 x i32]. The least significant 32 bits (element 0) are moved to the
3311 ///    destination; the other elements are ignored.
3312 /// \returns A 32-bit signed integer containing the moved value.
3313 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3314  __v4si __b = (__v4si)__a; /* re-view the operand as [4 x i32] so that element 0 is 32 bits wide */
3315  return __b[0];
3316 }
3317 
3318 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3319 ///    64-bit signed integer value.
3320 ///
3321 /// \headerfile <x86intrin.h>
3322 ///
3323 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3324 ///
3325 /// \param __a
3326 ///    A vector of [2 x i64]. The least significant 64 bits (element 0) are moved to the
3327 ///    destination; the upper element is ignored.
3328 /// \returns A 64-bit signed integer containing the moved value.
3329 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3330  return __a[0]; /* __m128i already has long long elements, so no re-view is needed */
3331 }
3332 
3333 /// Loads a 128-bit integer vector from an aligned 128-bit memory location.
3334 ///    The pointer is dereferenced directly as __m128i.
3335 ///
3336 /// \headerfile <x86intrin.h>
3337 ///
3338 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3339 ///
3340 /// \param __p
3341 ///    An aligned pointer to a memory location containing integer values. __m128i is declared with 16-byte alignment.
3342 /// \returns A 128-bit integer vector containing the moved values.
3343 static __inline__ __m128i __DEFAULT_FN_ATTRS
3344 _mm_load_si128(__m128i const *__p) {
3345  return *__p;
3346 }
3347 
3348 /// Loads a 128-bit integer vector from an unaligned 128-bit memory location.
3349 ///    The read goes through a local struct declared __packed__ and __may_alias__.
3350 ///
3351 /// \headerfile <x86intrin.h>
3352 ///
3353 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3354 ///
3355 /// \param __p
3356 ///    A pointer to a memory location containing integer values. __m128i_u is declared with 1-byte alignment.
3357 /// \returns A 128-bit integer vector containing the moved values.
3358 static __inline__ __m128i __DEFAULT_FN_ATTRS
3359 _mm_loadu_si128(__m128i_u const *__p) {
3360  struct __loadu_si128 { /* wrapper type used only to perform the load */
3361  __m128i_u __v;
3362  } __attribute__((__packed__, __may_alias__));
3363  return ((const struct __loadu_si128 *)__p)->__v;
3364 }
3365 
3366 /// Returns a vector of [2 x i64] where the lower element is loaded from
3367 ///    the first 64 bits at the given memory location, and the upper element is zero.
3368 ///
3369 /// \headerfile <x86intrin.h>
3370 ///
3371 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3372 ///
3373 /// \param __p
3374 ///    A pointer to a memory location. One 64-bit value is read from it and written to bits [63:0] of
3375 ///    the destination. The read goes through a local struct declared __packed__ and __may_alias__.
3376 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3377 ///    moved value. The higher order bits are cleared.
3378 static __inline__ __m128i __DEFAULT_FN_ATTRS
3379 _mm_loadl_epi64(__m128i_u const *__p) {
3380  struct __mm_loadl_epi64_struct { /* wrapper type used only to read a single long long */
3381  long long __u;
3382  } __attribute__((__packed__, __may_alias__));
3383  return __extension__(__m128i){
3384  ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0}; /* {loaded value, 0}: lower element first */
3385 }
3386 
3387 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3388 ///    This can be used as an argument to another intrinsic function where the
3389 ///    argument is required but its value is not actually used.
3390 ///
3391 /// \headerfile <x86intrin.h>
3392 ///
3393 /// This intrinsic has no corresponding instruction.
3394 ///
3395 /// \returns A 128-bit vector of [4 x i32] with unspecified content; callers must not rely on its value.
3396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3397  return (__m128i)__builtin_ia32_undef128();
3398 }
3399 
3400 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3401 ///    the specified 64-bit integer values. Arguments are given from the upper element down to the lower element.
3402 ///
3403 /// \headerfile <x86intrin.h>
3404 ///
3405 /// This intrinsic is a utility function and does not correspond to a specific
3406 ///    instruction.
3407 ///
3408 /// \param __q1
3409 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3410 ///    destination vector of [2 x i64].
3411 /// \param __q0
3412 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3413 ///    destination vector of [2 x i64].
3414 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3415 ///    provided in the operands.
3416 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3417  long long __q0) {
3418  return __extension__(__m128i)(__v2di){__q0, __q1}; /* initializer lists the lower element first, hence the reversed argument order */
3419 }
3420 
3421 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3422 ///    the specified 64-bit integer values, supplied as __m64 operands.
3423 ///
3424 /// \headerfile <x86intrin.h>
3425 ///
3426 /// This intrinsic is a utility function and does not correspond to a specific
3427 ///    instruction.
3428 ///
3429 /// \param __q1
3430 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3431 ///    destination vector of [2 x i64].
3432 /// \param __q0
3433 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3434 ///    destination vector of [2 x i64].
3435 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3436 ///    provided in the operands.
3437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3438  __m64 __q0) {
3439  return _mm_set_epi64x((long long)__q1, (long long)__q0); /* cast each __m64 to long long and delegate, keeping the upper/lower order */
3440 }
3441 
3442 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3443 ///    the specified 32-bit integer values. Arguments are given from the highest element down to the lowest.
3444 ///
3445 /// \headerfile <x86intrin.h>
3446 ///
3447 /// This intrinsic is a utility function and does not correspond to a specific
3448 ///    instruction.
3449 ///
3450 /// \param __i3
3451 ///    A 32-bit integer value used to initialize bits [127:96] of the
3452 ///    destination vector.
3453 /// \param __i2
3454 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
3455 ///    vector.
3456 /// \param __i1
3457 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
3458 ///    vector.
3459 /// \param __i0
3460 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
3461 ///    vector.
3462 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3463 ///    provided in the operands.
3464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3465  int __i1, int __i0) {
3466  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3}; /* initializer lists the lowest element first, hence the reversed argument order */
3467 }
3468 
3469 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3470 ///    the specified 16-bit integer values. Arguments are given from the highest element down to the lowest.
3471 ///
3472 /// \headerfile <x86intrin.h>
3473 ///
3474 /// This intrinsic is a utility function and does not correspond to a specific
3475 ///    instruction.
3476 ///
3477 /// \param __w7
3478 ///    A 16-bit integer value used to initialize bits [127:112] of the
3479 ///    destination vector.
3480 /// \param __w6
3481 ///    A 16-bit integer value used to initialize bits [111:96] of the
3482 ///    destination vector.
3483 /// \param __w5
3484 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
3485 ///    vector.
3486 /// \param __w4
3487 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
3488 ///    vector.
3489 /// \param __w3
3490 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
3491 ///    vector.
3492 /// \param __w2
3493 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
3494 ///    vector.
3495 /// \param __w1
3496 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
3497 ///    vector.
3498 /// \param __w0
3499 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
3500 ///    vector.
3501 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3502 ///    provided in the operands.
3503 static __inline__ __m128i __DEFAULT_FN_ATTRS
3504 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3505  short __w2, short __w1, short __w0) {
3506  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3, /* lowest element first, hence the reversed argument order */
3507  __w4, __w5, __w6, __w7};
3508 }
3509 
3510 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3511 ///    the specified 8-bit integer values. Arguments are given from the highest element down to the lowest.
3512 ///
3513 /// \headerfile <x86intrin.h>
3514 ///
3515 /// This intrinsic is a utility function and does not correspond to a specific
3516 ///    instruction.
3517 ///
3518 /// \param __b15
3519 ///    Initializes bits [127:120] of the destination vector.
3520 /// \param __b14
3521 ///    Initializes bits [119:112] of the destination vector.
3522 /// \param __b13
3523 ///    Initializes bits [111:104] of the destination vector.
3524 /// \param __b12
3525 ///    Initializes bits [103:96] of the destination vector.
3526 /// \param __b11
3527 ///    Initializes bits [95:88] of the destination vector.
3528 /// \param __b10
3529 ///    Initializes bits [87:80] of the destination vector.
3530 /// \param __b9
3531 ///    Initializes bits [79:72] of the destination vector.
3532 /// \param __b8
3533 ///    Initializes bits [71:64] of the destination vector.
3534 /// \param __b7
3535 ///    Initializes bits [63:56] of the destination vector.
3536 /// \param __b6
3537 ///    Initializes bits [55:48] of the destination vector.
3538 /// \param __b5
3539 ///    Initializes bits [47:40] of the destination vector.
3540 /// \param __b4
3541 ///    Initializes bits [39:32] of the destination vector.
3542 /// \param __b3
3543 ///    Initializes bits [31:24] of the destination vector.
3544 /// \param __b2
3545 ///    Initializes bits [23:16] of the destination vector.
3546 /// \param __b1
3547 ///    Initializes bits [15:8] of the destination vector.
3548 /// \param __b0
3549 ///    Initializes bits [7:0] of the destination vector.
3550 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3551 ///    provided in the operands.
3552 static __inline__ __m128i __DEFAULT_FN_ATTRS
3553 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3554  char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3555  char __b4, char __b3, char __b2, char __b1, char __b0) {
3556  return __extension__(__m128i)(__v16qi){ /* lowest element first, hence the reversed argument order */
3557  __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3558  __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3559 }
3560 
3561 /// Initializes both 64-bit elements of a 128-bit integer vector with the
3562 ///    same specified 64-bit integer value.
3563 ///
3564 /// \headerfile <x86intrin.h>
3565 ///
3566 /// This intrinsic is a utility function and does not correspond to a specific
3567 ///    instruction.
3568 ///
3569 /// \param __q
3570 ///    Integer value used to initialize the elements of the destination integer
3571 ///    vector.
3572 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3573 ///    elements containing the value provided in the operand.
3574 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3575  return _mm_set_epi64x(__q, __q); /* broadcast by passing the same value for both elements */
3576 }
3577 
3578 /// Initializes both 64-bit elements of a 128-bit vector of [2 x i64] with the
3579 ///    same specified 64-bit value, supplied as an __m64 operand.
3580 ///
3581 /// \headerfile <x86intrin.h>
3582 ///
3583 /// This intrinsic is a utility function and does not correspond to a specific
3584 ///    instruction.
3585 ///
3586 /// \param __q
3587 ///    A 64-bit value used to initialize the elements of the destination integer
3588 ///    vector.
3589 /// \returns An initialized 128-bit vector of [2 x i64] with both elements
3590 ///    containing the value provided in the operand.
3591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3592  return _mm_set_epi64(__q, __q); /* broadcast by passing the same value for both elements */
3593 }
3594 
3595 /// Initializes all four 32-bit elements of a 128-bit vector of [4 x i32] with the
3596 ///    same specified 32-bit value.
3597 ///
3598 /// \headerfile <x86intrin.h>
3599 ///
3600 /// This intrinsic is a utility function and does not correspond to a specific
3601 ///    instruction.
3602 ///
3603 /// \param __i
3604 ///    A 32-bit value used to initialize the elements of the destination integer
3605 ///    vector.
3606 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3607 ///    containing the value provided in the operand.
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3609  return _mm_set_epi32(__i, __i, __i, __i); /* broadcast by passing the same value for every element */
3610 }
3611 
3612 /// Initializes all eight 16-bit elements of a 128-bit vector of [8 x i16] with the
3613 ///    same specified 16-bit value.
3614 ///
3615 /// \headerfile <x86intrin.h>
3616 ///
3617 /// This intrinsic is a utility function and does not correspond to a specific
3618 ///    instruction.
3619 ///
3620 /// \param __w
3621 ///    A 16-bit value used to initialize the elements of the destination integer
3622 ///    vector.
3623 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3624 ///    containing the value provided in the operand.
3625 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3626  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); /* broadcast by passing the same value for every element */
3627 }
3628 
3629 /// Initializes all sixteen 8-bit elements of a 128-bit vector of [16 x i8] with the
3630 ///    same specified 8-bit value.
3631 ///
3632 /// \headerfile <x86intrin.h>
3633 ///
3634 /// This intrinsic is a utility function and does not correspond to a specific
3635 ///    instruction.
3636 ///
3637 /// \param __b
3638 ///    An 8-bit value used to initialize the elements of the destination integer
3639 ///    vector.
3640 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3641 ///    containing the value provided in the operand.
3642 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3643  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, /* broadcast: the same value for all 16 elements */
3644  __b, __b, __b, __b, __b);
3645 }
3646 
3647 /// Constructs a 128-bit integer vector from the specified 64-bit integral values,
3648 ///    taking the arguments in the reverse order of _mm_set_epi64: lower element first.
3649 ///
3650 /// \headerfile <x86intrin.h>
3651 ///
3652 /// This intrinsic does not correspond to a specific instruction.
3653 ///
3654 /// \param __q0
3655 ///    A 64-bit integral value used to initialize the lower 64 bits of the
3656 ///    result.
3657 /// \param __q1
3658 ///    A 64-bit integral value used to initialize the upper 64 bits of the
3659 ///    result.
3660 /// \returns An initialized 128-bit integer vector.
3661 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3662  __m64 __q1) {
3663  return _mm_set_epi64(__q1, __q0); /* delegate with the arguments swapped */
3664 }
3665 
3666 /// Constructs a 128-bit integer vector from the specified 32-bit integral values,
3667 ///    taking the arguments in the reverse order of _mm_set_epi32: lowest element first.
3668 ///
3669 /// \headerfile <x86intrin.h>
3670 ///
3671 /// This intrinsic is a utility function and does not correspond to a specific
3672 ///    instruction.
3673 ///
3674 /// \param __i0
3675 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
3676 /// \param __i1
3677 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
3678 /// \param __i2
3679 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
3680 /// \param __i3
3681 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
3682 /// \returns An initialized 128-bit integer vector.
3683 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3684  int __i2,
3685  int __i3) {
3686  return _mm_set_epi32(__i3, __i2, __i1, __i0); /* delegate with the argument order reversed */
3687 }
3688 
3689 /// Constructs a 128-bit integer vector from the specified 16-bit integral values,
3690 ///    taking the arguments in the reverse order of _mm_set_epi16: lowest element first.
3691 ///
3692 /// \headerfile <x86intrin.h>
3693 ///
3694 /// This intrinsic is a utility function and does not correspond to a specific
3695 ///    instruction.
3696 ///
3697 /// \param __w0
3698 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
3699 /// \param __w1
3700 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
3701 /// \param __w2
3702 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
3703 /// \param __w3
3704 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
3705 /// \param __w4
3706 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
3707 /// \param __w5
3708 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
3709 /// \param __w6
3710 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
3711 /// \param __w7
3712 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
3713 /// \returns An initialized 128-bit integer vector.
3714 static __inline__ __m128i __DEFAULT_FN_ATTRS
3715 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3716  short __w5, short __w6, short __w7) {
3717  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); /* delegate with the argument order reversed */
3718 }
3719 
3720 /// Constructs a 128-bit integer vector from the specified 8-bit integral values,
3721 ///    taking the arguments in the reverse order of _mm_set_epi8: lowest element first.
3722 ///
3723 /// \headerfile <x86intrin.h>
3724 ///
3725 /// This intrinsic is a utility function and does not correspond to a specific
3726 ///    instruction.
3727 ///
3728 /// \param __b0
3729 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
3730 /// \param __b1
3731 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
3732 /// \param __b2
3733 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
3734 /// \param __b3
3735 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
3736 /// \param __b4
3737 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
3738 /// \param __b5
3739 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
3740 /// \param __b6
3741 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
3742 /// \param __b7
3743 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
3744 /// \param __b8
3745 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
3746 /// \param __b9
3747 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
3748 /// \param __b10
3749 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
3750 /// \param __b11
3751 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
3752 /// \param __b12
3753 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
3754 /// \param __b13
3755 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
3756 /// \param __b14
3757 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
3758 /// \param __b15
3759 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
3760 /// \returns An initialized 128-bit integer vector.
3761 static __inline__ __m128i __DEFAULT_FN_ATTRS
3762 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3763  char __b6, char __b7, char __b8, char __b9, char __b10,
3764  char __b11, char __b12, char __b13, char __b14, char __b15) {
3765  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, /* delegate with the argument order reversed */
3766  __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3767 }
3768 
3769 /// Creates a 128-bit integer vector initialized to zero.
3770 ///
3771 /// \headerfile <x86intrin.h>
3772 ///
3773 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3774 ///
3775 /// \returns An initialized 128-bit integer vector with all elements set to
3776 /// zero.
3777 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3778  return __extension__(__m128i)(__v2di){0LL, 0LL};
3779 }
3780 
3781 /// Stores a 128-bit integer vector to a memory location aligned on a
3782 /// 128-bit boundary.
3783 ///
3784 /// \headerfile <x86intrin.h>
3785 ///
3786 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3787 ///
3788 /// \param __p
3789 /// A pointer to an aligned memory location that will receive the integer
3790 /// values.
3791 /// \param __b
3792 /// A 128-bit integer vector containing the values to be moved.
3793 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3794  __m128i __b) {
3795  *__p = __b;
3796 }
3797 
3798 /// Stores a 128-bit integer vector to an unaligned memory location.
3799 ///
3800 /// \headerfile <x86intrin.h>
3801 ///
3802 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3803 ///
3804 /// \param __p
3805 /// A pointer to a memory location that will receive the integer values.
3806 /// \param __b
3807 /// A 128-bit integer vector containing the values to be moved.
3808 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3809  __m128i __b) {
3810  struct __storeu_si128 {
3811  __m128i_u __v;
3812  } __attribute__((__packed__, __may_alias__));
3813  ((struct __storeu_si128 *)__p)->__v = __b;
3814 }
3815 
3816 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3817 /// vector.
3818 ///
3819 /// \headerfile <x86intrin.h>
3820 ///
3821 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3822 ///
3823 /// \param __p
3824 /// A pointer to a 64-bit memory location. The address of the memory
3825 /// location does not have to be aligned.
3826 /// \param __b
3827 /// A 128-bit integer vector containing the value to be stored.
3828 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3829  __m128i __b) {
3830  struct __storeu_si64 {
3831  long long __v;
3832  } __attribute__((__packed__, __may_alias__));
3833  ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3834 }
3835 
3836 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3837 /// vector.
3838 ///
3839 /// \headerfile <x86intrin.h>
3840 ///
3841 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3842 ///
3843 /// \param __p
3844 /// A pointer to a 32-bit memory location. The address of the memory
3845 /// location does not have to be aligned.
3846 /// \param __b
3847 /// A 128-bit integer vector containing the value to be stored.
3848 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3849  __m128i __b) {
3850  struct __storeu_si32 {
3851  int __v;
3852  } __attribute__((__packed__, __may_alias__));
3853  ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3854 }
3855 
3856 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3857 /// vector.
3858 ///
3859 /// \headerfile <x86intrin.h>
3860 ///
3861 /// This intrinsic does not correspond to a specific instruction.
3862 ///
3863 /// \param __p
3864 /// A pointer to a 16-bit memory location. The address of the memory
3865 /// location does not have to be aligned.
3866 /// \param __b
3867 /// A 128-bit integer vector containing the value to be stored.
3868 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3869  __m128i __b) {
3870  struct __storeu_si16 {
3871  short __v;
3872  } __attribute__((__packed__, __may_alias__));
3873  ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3874 }
3875 
3876 /// Moves bytes selected by the mask from the first operand to the
3877 /// specified unaligned memory location. When a mask bit is 1, the
3878 /// corresponding byte is written, otherwise it is not written.
3879 ///
3880 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3881 /// used again soon). Exception and trap behavior for elements not selected
3882 /// for storage to memory are implementation dependent.
3883 ///
3884 /// \headerfile <x86intrin.h>
3885 ///
3886 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3887 /// instruction.
3888 ///
3889 /// \param __d
3890 /// A 128-bit integer vector containing the values to be moved.
3891 /// \param __n
3892 /// A 128-bit integer vector containing the mask. The most significant bit of
3893 /// each byte represents the mask bits.
3894 /// \param __p
3895 /// A pointer to an unaligned 128-bit memory location where the specified
3896 /// values are moved.
3897 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3898  __m128i __n,
3899  char *__p) {
3900  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3901 }
3902 
3903 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3904 /// a memory location.
3905 ///
3906 /// \headerfile <x86intrin.h>
3907 ///
3908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3909 ///
3910 /// \param __p
3911 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3912 /// of the integer vector parameter.
3913 /// \param __a
3914 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3915 /// value to be stored.
3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3917  __m128i __a) {
3918  struct __mm_storel_epi64_struct {
3919  long long __u;
3920  } __attribute__((__packed__, __may_alias__));
3921  ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3922 }
3923 
3924 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3925 /// aligned memory location.
3926 ///
3927 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3928 /// used again soon).
3929 ///
3930 /// \headerfile <x86intrin.h>
3931 ///
3932 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3933 ///
3934 /// \param __p
3935 /// A pointer to the 128-bit aligned memory location used to store the value.
3936 /// \param __a
3937 /// A vector of [2 x double] containing the 64-bit values to be stored.
3938 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
3939  __m128d __a) {
3940  __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3941 }
3942 
3943 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3944 ///
3945 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3946 /// used again soon).
3947 ///
3948 /// \headerfile <x86intrin.h>
3949 ///
3950 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3951 ///
3952 /// \param __p
3953 /// A pointer to the 128-bit aligned memory location used to store the value.
3954 /// \param __a
3955 /// A 128-bit integer vector containing the values to be stored.
3956 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
3957  __m128i __a) {
3958  __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3959 }
3960 
/// Writes a 32-bit integer value to the given memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    Pointer to the 32-bit memory location that receives the value.
/// \param __a
///    The 32-bit integer value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(int *__p, int __a) {
  int *const __dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
3979 
#ifdef __x86_64__
/// Writes a 64-bit integer value to the given memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    Pointer to the 64-bit memory location that receives the value.
/// \param __a
///    The 64-bit integer value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(long long *__p, long long __a) {
  long long *const __dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif
4000 
#if defined(__cplusplus)
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all caches
///    in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    Pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Enforces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it: the system completes all earlier loads before it
///    executes any later load.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Enforces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and store
///    instructions that follow it: the system completes all earlier memory
///    accesses before it executes any later memory access.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4042 
4043 /// Converts 16-bit signed integers from both 128-bit integer vector
4044 /// operands into 8-bit signed integers, and packs the results into the
4045 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4046 /// Negative values less than 0x80 are saturated to 0x80.
4047 ///
4048 /// \headerfile <x86intrin.h>
4049 ///
4050 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4051 ///
4052 /// \param __a
4053 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4054 /// a signed integer and is converted to a 8-bit signed integer with
4055 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4056 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4057 /// written to the lower 64 bits of the result.
4058 /// \param __b
4059 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4060 /// a signed integer and is converted to a 8-bit signed integer with
4061 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4062 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4063 /// written to the higher 64 bits of the result.
4064 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4065 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4066  __m128i __b) {
4067  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4068 }
4069 
4070 /// Converts 32-bit signed integers from both 128-bit integer vector
4071 /// operands into 16-bit signed integers, and packs the results into the
4072 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4073 /// Negative values less than 0x8000 are saturated to 0x8000.
4074 ///
4075 /// \headerfile <x86intrin.h>
4076 ///
4077 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4078 ///
4079 /// \param __a
4080 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4081 /// a signed integer and is converted to a 16-bit signed integer with
4082 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4083 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4084 /// are written to the lower 64 bits of the result.
4085 /// \param __b
4086 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4087 /// a signed integer and is converted to a 16-bit signed integer with
4088 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4089 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4090 /// are written to the higher 64 bits of the result.
4091 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4092 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4093  __m128i __b) {
4094  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4095 }
4096 
4097 /// Converts 16-bit signed integers from both 128-bit integer vector
4098 /// operands into 8-bit unsigned integers, and packs the results into the
4099 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4100 /// than 0x00 are saturated to 0x00.
4101 ///
4102 /// \headerfile <x86intrin.h>
4103 ///
4104 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4105 ///
4106 /// \param __a
4107 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4108 /// a signed integer and is converted to an 8-bit unsigned integer with
4109 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4110 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4111 /// written to the lower 64 bits of the result.
4112 /// \param __b
4113 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4114 /// a signed integer and is converted to an 8-bit unsigned integer with
4115 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4116 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4117 /// written to the higher 64 bits of the result.
4118 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4119 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4120  __m128i __b) {
4121  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4122 }
4123 
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the 16-bit element of \a a that
///    is assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
///    (The extracted element is converted to <c>unsigned short</c> before
///    being widened to <c>int</c>, which is what zero-fills the upper bits.)
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4153 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into the 16-bit element selected by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value giving the index of the 16-bit element of the result
///    that is replaced by the lower 16 bits of \a b. Index \c i refers to
///    bits [16*i+15:16*i] of the result; it is an element index, not a bit
///    offset.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4181 
4182 /// Copies the values of the most significant bits from each 8-bit
4183 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4184 /// value, zero-extends the value, and writes it to the destination.
4185 ///
4186 /// \headerfile <x86intrin.h>
4187 ///
4188 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4189 ///
4190 /// \param __a
4191 /// A 128-bit integer vector containing the values with bits to be extracted.
4192 /// \returns The most significant bits from each 8-bit element in \a __a,
4193 /// written to bits [15:0]. The other bits are assigned zeros.
4194 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4195  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4196 }
4197 
/// Builds a 128-bit integer vector by shuffling the four 32-bit elements of
///    a 128-bit integer vector parameter as directed by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector holding the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value that selects which
///    elements to copy from \a a. Each 2-bit field controls one 32-bit
///    element of the result: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4231 
/// Builds a 128-bit integer vector by shuffling the four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16] as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value that selects which elements to copy from
///    \a a. Each 2-bit field controls one of the lower 16-bit elements of the
///    result: \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4264 
/// Builds a 128-bit integer vector by shuffling the four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16] as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value that selects which elements to copy from
///    \a a. Each 2-bit field controls one of the upper 16-bit elements of the
///    result: \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4297 
4298 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4299 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4300 ///
4301 /// \headerfile <x86intrin.h>
4302 ///
4303 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4304 /// instruction.
4305 ///
4306 /// \param __a
4307 /// A 128-bit vector of [16 x i8].
4308 /// Bits [71:64] are written to bits [7:0] of the result. \n
4309 /// Bits [79:72] are written to bits [23:16] of the result. \n
4310 /// Bits [87:80] are written to bits [39:32] of the result. \n
4311 /// Bits [95:88] are written to bits [55:48] of the result. \n
4312 /// Bits [103:96] are written to bits [71:64] of the result. \n
4313 /// Bits [111:104] are written to bits [87:80] of the result. \n
4314 /// Bits [119:112] are written to bits [103:96] of the result. \n
4315 /// Bits [127:120] are written to bits [119:112] of the result.
4316 /// \param __b
4317 /// A 128-bit vector of [16 x i8]. \n
4318 /// Bits [71:64] are written to bits [15:8] of the result. \n
4319 /// Bits [79:72] are written to bits [31:24] of the result. \n
4320 /// Bits [87:80] are written to bits [47:40] of the result. \n
4321 /// Bits [95:88] are written to bits [63:56] of the result. \n
4322 /// Bits [103:96] are written to bits [79:72] of the result. \n
4323 /// Bits [111:104] are written to bits [95:88] of the result. \n
4324 /// Bits [119:112] are written to bits [111:104] of the result. \n
4325 /// Bits [127:120] are written to bits [127:120] of the result.
4326 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4327 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4328  __m128i __b) {
4329  return (__m128i)__builtin_shufflevector(
4330  (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4331  16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4332 }
4333 
4334 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4335 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4336 ///
4337 /// \headerfile <x86intrin.h>
4338 ///
4339 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4340 /// instruction.
4341 ///
4342 /// \param __a
4343 /// A 128-bit vector of [8 x i16].
4344 /// Bits [79:64] are written to bits [15:0] of the result. \n
4345 /// Bits [95:80] are written to bits [47:32] of the result. \n
4346 /// Bits [111:96] are written to bits [79:64] of the result. \n
4347 /// Bits [127:112] are written to bits [111:96] of the result.
4348 /// \param __b
4349 /// A 128-bit vector of [8 x i16].
4350 /// Bits [79:64] are written to bits [31:16] of the result. \n
4351 /// Bits [95:80] are written to bits [63:48] of the result. \n
4352 /// Bits [111:96] are written to bits [95:80] of the result. \n
4353 /// Bits [127:112] are written to bits [127:112] of the result.
4354 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4355 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4356  __m128i __b) {
4357  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4358  8 + 5, 6, 8 + 6, 7, 8 + 7);
4359 }
4360 
4361 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4362 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4363 ///
4364 /// \headerfile <x86intrin.h>
4365 ///
4366 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4367 /// instruction.
4368 ///
4369 /// \param __a
4370 /// A 128-bit vector of [4 x i32]. \n
4371 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4372 /// Bits [127:96] are written to bits [95:64] of the destination.
4373 /// \param __b
4374 /// A 128-bit vector of [4 x i32]. \n
4375 /// Bits [95:64] are written to bits [64:32] of the destination. \n
4376 /// Bits [127:96] are written to bits [127:96] of the destination.
4377 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4378 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4379  __m128i __b) {
4380  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4381  4 + 3);
4382 }
4383 
4384 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4385 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4386 ///
4387 /// \headerfile <x86intrin.h>
4388 ///
4389 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4390 /// instruction.
4391 ///
4392 /// \param __a
4393 /// A 128-bit vector of [2 x i64]. \n
4394 /// Bits [127:64] are written to bits [63:0] of the destination.
4395 /// \param __b
4396 /// A 128-bit vector of [2 x i64]. \n
4397 /// Bits [127:64] are written to bits [127:64] of the destination.
4398 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4400  __m128i __b) {
4401  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4402 }
4403 
4404 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4405 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4406 ///
4407 /// \headerfile <x86intrin.h>
4408 ///
4409 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4410 /// instruction.
4411 ///
4412 /// \param __a
4413 /// A 128-bit vector of [16 x i8]. \n
4414 /// Bits [7:0] are written to bits [7:0] of the result. \n
4415 /// Bits [15:8] are written to bits [23:16] of the result. \n
4416 /// Bits [23:16] are written to bits [39:32] of the result. \n
4417 /// Bits [31:24] are written to bits [55:48] of the result. \n
4418 /// Bits [39:32] are written to bits [71:64] of the result. \n
4419 /// Bits [47:40] are written to bits [87:80] of the result. \n
4420 /// Bits [55:48] are written to bits [103:96] of the result. \n
4421 /// Bits [63:56] are written to bits [119:112] of the result.
4422 /// \param __b
4423 /// A 128-bit vector of [16 x i8].
4424 /// Bits [7:0] are written to bits [15:8] of the result. \n
4425 /// Bits [15:8] are written to bits [31:24] of the result. \n
4426 /// Bits [23:16] are written to bits [47:40] of the result. \n
4427 /// Bits [31:24] are written to bits [63:56] of the result. \n
4428 /// Bits [39:32] are written to bits [79:72] of the result. \n
4429 /// Bits [47:40] are written to bits [95:88] of the result. \n
4430 /// Bits [55:48] are written to bits [111:104] of the result. \n
4431 /// Bits [63:56] are written to bits [127:120] of the result.
4432 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4433 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4434  __m128i __b) {
4435  return (__m128i)__builtin_shufflevector(
4436  (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4437  16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4438 }
4439 
4440 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4441 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4442 /// [8 x i16].
4443 ///
4444 /// \headerfile <x86intrin.h>
4445 ///
4446 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4447 /// instruction.
4448 ///
4449 /// \param __a
4450 /// A 128-bit vector of [8 x i16].
4451 /// Bits [15:0] are written to bits [15:0] of the result. \n
4452 /// Bits [31:16] are written to bits [47:32] of the result. \n
4453 /// Bits [47:32] are written to bits [79:64] of the result. \n
4454 /// Bits [63:48] are written to bits [111:96] of the result.
4455 /// \param __b
4456 /// A 128-bit vector of [8 x i16].
4457 /// Bits [15:0] are written to bits [31:16] of the result. \n
4458 /// Bits [31:16] are written to bits [63:48] of the result. \n
4459 /// Bits [47:32] are written to bits [95:80] of the result. \n
4460 /// Bits [63:48] are written to bits [127:112] of the result.
4461 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4462 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4463  __m128i __b) {
4464  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4465  8 + 1, 2, 8 + 2, 3, 8 + 3); // k = element k of __a, 8 + k = element k of __b
4466 }
4467 
4468 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4469 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4470 ///
4471 /// \headerfile <x86intrin.h>
4472 ///
4473 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4474 /// instruction.
4475 ///
4476 /// \param __a
4477 /// A 128-bit vector of [4 x i32]. \n
4478 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4479 /// Bits [63:32] are written to bits [95:64] of the destination.
4480 /// \param __b
4481 /// A 128-bit vector of [4 x i32]. \n
4482 /// Bits [31:0] are written to bits [63:32] of the destination. \n
4483 /// Bits [63:32] are written to bits [127:96] of the destination.
4484 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4485 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4486  __m128i __b) {
4487  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4488  4 + 1); // k = element k of __a, 4 + k = element k of __b
4489 }
4490 
4491 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4492 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4493 ///
4494 /// \headerfile <x86intrin.h>
4495 ///
4496 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4497 /// instruction.
4498 ///
4499 /// \param __a
4500 /// A 128-bit vector of [2 x i64]. \n
4501 /// Bits [63:0] are written to bits [63:0] of the destination.
4502 /// \param __b
4503 /// A 128-bit vector of [2 x i64]. \n
4504 /// Bits [63:0] are written to bits [127:64] of the destination.
4505 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4507  __m128i __b) {
4508  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0); // low element of __a, then low element of __b
4509 }
4510 
4511 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4512 /// integer.
4513 ///
4514 /// \headerfile <x86intrin.h>
4515 ///
4516 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4517 ///
4518 /// \param __a
4519 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4520 /// destination.
4521 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4522 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4523  return (__m64)__a[0]; // element 0 of __a is its lower 64 bits
4524 }
4525 
4526 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4527 /// upper bits.
4528 ///
4529 /// \headerfile <x86intrin.h>
4530 ///
4531 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4532 ///
4533 /// \param __a
4534 /// A 64-bit value.
4535 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4536 /// the operand. The upper 64 bits are assigned zeros.
4537 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4538  return __extension__(__m128i)(__v2di){(long long)__a, 0}; // {lower element = __a, upper element = 0}
4539 }
4540 
4541 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4542 /// integer vector, zeroing the upper bits.
4543 ///
4544 /// \headerfile <x86intrin.h>
4545 ///
4546 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4547 ///
4548 /// \param __a
4549 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4550 /// destination.
4551 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4552 /// the operand. The upper 64 bits are assigned zeros.
4553 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4554  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); // 0 = lower element of __a, 2 = lower element of the zero vector
4555 }
4556 
4557 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4558 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4559 /// double].
4560 ///
4561 /// \headerfile <x86intrin.h>
4562 ///
4563 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4564 ///
4565 /// \param __a
4566 /// A 128-bit vector of [2 x double]. \n
4567 /// Bits [127:64] are written to bits [63:0] of the destination.
4568 /// \param __b
4569 /// A 128-bit vector of [2 x double]. \n
4570 /// Bits [127:64] are written to bits [127:64] of the destination.
4571 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4572 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4573  __m128d __b) {
4574  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1); // upper element of __a, then upper element of __b
4575 }
4576 
4577 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4578 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4579 /// double].
4580 ///
4581 /// \headerfile <x86intrin.h>
4582 ///
4583 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4584 ///
4585 /// \param __a
4586 /// A 128-bit vector of [2 x double]. \n
4587 /// Bits [63:0] are written to bits [63:0] of the destination.
4588 /// \param __b
4589 /// A 128-bit vector of [2 x double]. \n
4590 /// Bits [63:0] are written to bits [127:64] of the destination.
4591 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4592 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4593  __m128d __b) {
4594  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0); // lower element of __a, then lower element of __b
4595 }
4596 
4597 /// Extracts the sign bits of the two double-precision values in the 128-bit
4598 /// vector of [2 x double], zero-extends the resulting 2-bit mask, and writes
4599 /// it to the low-order bits of the destination.
4600 ///
4601 /// \headerfile <x86intrin.h>
4602 ///
4603 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4604 ///
4605 /// \param __a
4606 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4607 /// be extracted.
4608 /// \returns The sign bits from each of the double-precision elements in \a __a,
4609 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4610 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4611  return __builtin_ia32_movmskpd((__v2df)__a);
4612 }
4613 
4614 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4615 /// 128-bit vector parameters of [2 x double], using the immediate-value
4616 /// parameter as a specifier.
4617 ///
4618 /// \headerfile <x86intrin.h>
4619 ///
4620 /// \code
4621 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4622 /// \endcode
4623 ///
4624 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4625 ///
4626 /// \param a
4627 /// A 128-bit vector of [2 x double].
4628 /// \param b
4629 /// A 128-bit vector of [2 x double].
4630 /// \param i
4631 /// An 8-bit immediate value. The least significant two bits specify which
4632 /// elements to copy from \a a and \a b: \n
4633 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4634 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4635 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4636 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4637 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4638 /// <c>_MM_SHUFFLE2(b1, b0)</c> creates a 2-bit mask of the form
4639 /// <c>[b1, b0]</c>, with \c b1 in Bit[1] and \c b0 in Bit[0].
4640 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
4641 #define _mm_shuffle_pd(a, b, i) \
4642  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4643  (int)(i)))
4644 
4645 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4646 /// floating-point vector of [4 x float].
4647 ///
4648 /// \headerfile <x86intrin.h>
4649 ///
4650 /// This intrinsic has no corresponding instruction.
4651 ///
4652 /// \param __a
4653 /// A 128-bit floating-point vector of [2 x double].
4654 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4655 /// bitwise pattern as the parameter.
4656 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4657  return (__m128)__a; // same 128 bits, retyped as [4 x float]
4658 }
4659 
4660 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4661 /// integer vector.
4662 ///
4663 /// \headerfile <x86intrin.h>
4664 ///
4665 /// This intrinsic has no corresponding instruction.
4666 ///
4667 /// \param __a
4668 /// A 128-bit floating-point vector of [2 x double].
4669 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4670 /// parameter.
4671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4672  return (__m128i)__a; // same 128 bits, retyped as an integer vector
4673 }
4674 
4675 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4676 /// floating-point vector of [2 x double].
4677 ///
4678 /// \headerfile <x86intrin.h>
4679 ///
4680 /// This intrinsic has no corresponding instruction.
4681 ///
4682 /// \param __a
4683 /// A 128-bit floating-point vector of [4 x float].
4684 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4685 /// bitwise pattern as the parameter.
4686 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4687  return (__m128d)__a; // same 128 bits, retyped as [2 x double]
4688 }
4689 
4690 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4691 /// integer vector.
4692 ///
4693 /// \headerfile <x86intrin.h>
4694 ///
4695 /// This intrinsic has no corresponding instruction.
4696 ///
4697 /// \param __a
4698 /// A 128-bit floating-point vector of [4 x float].
4699 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4700 /// parameter.
4701 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4702  return (__m128i)__a; // same 128 bits, retyped as an integer vector
4703 }
4704 
4705 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4706 /// of [4 x float].
4707 ///
4708 /// \headerfile <x86intrin.h>
4709 ///
4710 /// This intrinsic has no corresponding instruction.
4711 ///
4712 /// \param __a
4713 /// A 128-bit integer vector.
4714 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4715 /// bitwise pattern as the parameter.
4716 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4717  return (__m128)__a; // same 128 bits, retyped as [4 x float]
4718 }
4719 
4720 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4721 /// of [2 x double].
4722 ///
4723 /// \headerfile <x86intrin.h>
4724 ///
4725 /// This intrinsic has no corresponding instruction.
4726 ///
4727 /// \param __a
4728 /// A 128-bit integer vector.
4729 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4730 /// bitwise pattern as the parameter.
4731 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4732  return (__m128d)__a; // same 128 bits, retyped as [2 x double]
4733 }
4734 
4735 #if defined(__cplusplus)
4736 extern "C" {
4737 #endif
4738 
4739 /// Indicates that a spin loop is being executed, so that power
4740 /// consumption during the loop can be optimized.
4741 ///
4742 /// \headerfile <x86intrin.h>
4743 ///
4744 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4745 ///
4746 void _mm_pause(void);
4747 
4748 #if defined(__cplusplus)
4749 } // extern "C"
4750 #endif
4751 #undef __DEFAULT_FN_ATTRS
4752 #undef __DEFAULT_FN_ATTRS_MMX
4753 
4754 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) /* x -> bit 1, y -> bit 0 */
4755 
4756 #define _MM_DENORMALS_ZERO_ON (0x0040U) /* same bit as _MM_DENORMALS_ZERO_MASK */
4757 #define _MM_DENORMALS_ZERO_OFF (0x0000U) /* no bits set */
4758 
4759 #define _MM_DENORMALS_ZERO_MASK (0x0040U) /* bit of the _mm_getcsr() value selected by the mode macros */
4760 
4761 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) /* yields _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF */
4762 #define _MM_SET_DENORMALS_ZERO_MODE(x) \
4763  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) /* clears the masked bit, then ORs in x unmasked; pass _MM_DENORMALS_ZERO_ON or _OFF */
4764 
4765 #endif /* __EMMINTRIN_H */
_mm_xor_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:397
_mm_set_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1777
_mm_load1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1523
_mm_cvtepi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1287
_mm_cvtpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1444
_mm_undefined_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1709
_mm_set_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3464
_mm_mul_epu32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2399
__x
static __inline unsigned char unsigned int __x
Definition: adxintrin.h:22
_mm_srli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2977
_mm_cmpeq_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3013
_mm_cmpnle_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:884
_mm_add_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2027
_mm_loadl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3379
_mm_madd_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2230
_mm_unpacklo_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4592
_mm_pause
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
_mm_set_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3416
_mm_add_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2048
_mm_storeu_si32
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3848
_mm_loadu_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3359
_mm_sqrt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:226
_mm_cmpgt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:733
_mm_cmpgt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3110
_mm_subs_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2584
_mm_unpackhi_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4572
__v
struct __storeu_i16 *__P __v
Definition: immintrin.h:386
_mm_comige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1057
_mm_sub_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:104
_mm_srai_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2843
_mm_ucomigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1180
_mm_cmpunord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:812
_mm_storeu_si16
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3868
_mm_div_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:183
_mm_and_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2601
_mm_add_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2086
_mm_set_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3553
_mm_cvtsd_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1343
_mm_unpacklo_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4485
_mm_cmpeq_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3031
_mm_set_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1727
_mm_max_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:326
_mm_subs_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2526
_mm_setr_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3661
_mm_sll_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2786
__a
static __inline__ void int __a
Definition: emmintrin.h:3976
_mm_ucomineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1230
_mm_ucomile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1155
_mm_castpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4656
_mm_mul_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:161
_mm_sub_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2506
_mm_set1_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3574
_mm_storel_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1982
__DEFAULT_FN_ATTRS_MMX
#define __DEFAULT_FN_ATTRS_MMX
Definition: emmintrin.h:45
_mm_loadu_si16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1619
_mm_sra_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2862
_mm_storeu_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1924
_mm_unpackhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4355
_mm_load_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit vector of [2 x double] and clears the u...
Definition: emmintrin.h:1638
_mm_undefined_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3396
_mm_store_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1868
_mm_packus_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4119
xmmintrin.h
_mm_srl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2995
_mm_setr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1797
_mm_cmpneq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:836
_mm_comineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1082
_mm_cvtsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1320
__DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:42
_mm_cmpgt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:474
_mm_max_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2268
_mm_cvttps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3269
_mm_maskmoveu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3897
_mm_cmpeq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:415
_mm_cmple_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:454
_mm_loadh_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1663
_mm_cmpge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:758
_mm_sqrt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of each of the two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:243
_mm_cvtps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3254
_mm_movepi64_pi64
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4522
_mm_min_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2287
_mm_xor_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2653
_mm_andnot_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:363
_mm_set1_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3591
_mm_cmpnge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:933
_mm_unpackhi_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4399
_mm_cvtps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1266
_mm_cvtsi32_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1365
_mm_add_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:64
_mm_setr_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3762
_mm_cmpeq_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3049
_mm_cmplt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:685
_mm_unpacklo_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4433
_mm_set1_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3642
_mm_cmple_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:709
_mm_cvttpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1413
_mm_mulhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2325
_mm_comile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1007
_mm_loadu_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1563
_mm_sra_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2824
_mm_cvtpi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1477
_mm_clflush
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
_mm_cmplt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3130
_mm_move_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4553
_mm_mulhi_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2344
_mm_cmpnle_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:599
_mm_avg_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2205
_mm_load_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer vector.
Definition: emmintrin.h:3344
_mm_mullo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2363
_mm_cvtsi128_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3313
_mm_cmpeq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:661
_mm_move_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1830
_mm_sub_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2471
_mm_srl_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2923
_mm_cmpnlt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:860
_mm_cvtsi64_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is zero.
Definition: emmintrin.h:3298
_mm_unpackhi_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4378
_mm_sub_si64
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2489
_mm_sub_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2454
_mm_adds_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2167
_mm_slli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2768
_mm_sll_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2750
_mm_set_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer values.
Definition: emmintrin.h:3504
__attribute__
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
_mm_castsi128_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4716
_mm_storeh_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1963
_mm_srli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2941
_mm_unpacklo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4462
_mm_packs_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4092
_mm_cvtepi32_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3240
_mm_comieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:957
_mm_storeu_si64
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3828
_mm_slli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2696
_mm_ucomieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1105
_mm_store_pd1
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a memory location.
Definition: emmintrin.h:1907
_mm_setzero_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3777
_mm_castps_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4701
_mm_adds_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2147
_mm_div_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:202
_mm_or_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:380
_mm_andnot_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2620
_mm_unpacklo_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4506
_mm_cmpunord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:539
_mm_cvtpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1305
_mm_subs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2546
_mm_storeu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3808
_mm_mul_su32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2381
_mm_stream_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:3938
_mm_set1_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3625
_mm_loadl_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1689
_mm_slli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2732
_mm_cmpge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:494
_mm_sll_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2714
__p
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:24
_mm_cvttpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1462
_mm_min_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:284
_mm_packs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers,...
Definition: emmintrin.h:4065
_mm_sad_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2420
_mm_comigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1032
_mm_comilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:982
_mm_storer_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-byte aligned memory location.
Definition: emmintrin.h:1946
_mm_setr_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral values.
Definition: emmintrin.h:3715
_mm_cvtss_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1390
_mm_srai_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2805
_mm_cmplt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:434
_mm_castps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4686
_mm_movemask_pd
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4610
_mm_min_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:265
_mm_sub_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:122
_mm_cmpneq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:559
_mm_movpi64_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4537
_mm_cmpord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:516
_mm_movemask_epi8
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4194
_mm_set1_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3608
_mm_set1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1743
_mm_store1_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a memory location.
Definition: emmintrin.h:1887
_mm_add_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:82
_mm_cvtsi128_si64
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3329
_mm_avg_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2186
_mm_or_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2636
_mm_adds_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2106
__b
static __inline__ vector float vector float __b
Definition: altivec.h:578
_mm_mfence
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
_mm_loadu_si64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper element.
Definition: emmintrin.h:1581
_mm_cmpnlt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:579
_mm_cmplt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3170
_mm_store_sd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1847
_mm_lfence
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
_mm_add_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2006
_mm_loadr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit vector of [2 x double].
Definition: emmintrin.h:1547
_mm_cmpgt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3090
_mm_loadu_si32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1600
_mm_sub_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2437
_mm_cmpgt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3068
_mm_unpackhi_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4327
_mm_max_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:307
_mm_cvtpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1248
_mm_ucomilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1130
_mm_store_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3793
_mm_cvtsd_f64
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-point value.
Definition: emmintrin.h:1492
__c
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4788
_mm_castpd_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4671
_mm_stream_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:3956
_mm_min_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2306
_mm_cmpnge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:639
_mm_subs_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
Definition: emmintrin.h:2565
_mm_and_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:343
_mm_mul_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bits of the result.
Definition: emmintrin.h:143
_mm_adds_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2127
_mm_cmplt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3150
_mm_storel_epi64
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3916
_mm_set_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition: emmintrin.h:3437
_mm_ucomige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1205
_mm_cvttsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
Definition: emmintrin.h:1429
_mm_cvtsi32_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining elements are zero.
Definition: emmintrin.h:3283
_mm_cmpngt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:619
_mm_set_pd1
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1759
_mm_castsi128_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4731
_mm_setzero_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1811
_mm_add_si64
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2065
_mm_srli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2905
_mm_srl_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2959
_mm_max_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2249
_mm_setr_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition: emmintrin.h:3683
_mm_cmpngt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:908
_mm_cmpord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:785
_mm_load_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1507