clang  17.0.0git
emmintrin.h
Go to the documentation of this file.
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* 16-byte vectors of two doubles and of two long longs, aligned to 16 bytes. */
19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21 
/* The same vector shapes declared with 1-byte alignment (the _u suffix). */
22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23 typedef long long __m128i_u
24  __attribute__((__vector_size__(16), __aligned__(1)));
25 
26 /* Type defines. */
/* Element-typed 16-byte vectors; the functions in this file cast to these
 * (e.g. __v2df) before applying operators or builtins. */
27 typedef double __v2df __attribute__((__vector_size__(16)));
28 typedef long long __v2di __attribute__((__vector_size__(16)));
29 typedef short __v8hi __attribute__((__vector_size__(16)));
30 typedef char __v16qi __attribute__((__vector_size__(16)));
31 
32 /* Unsigned types */
/* __v2du is the view used by the bitwise and/andnot/or/xor functions. */
33 typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36 
37 /* We need an explicitly signed variant for char. Note that this shouldn't
38  * appear in the interface though. */
39 typedef signed char __v16qs __attribute__((__vector_size__(16)));
40 
41 #ifdef __SSE2__
42 /* Both _Float16 and __bf16 require SSE2 to be enabled. */
/* Eight-element half-precision vectors: 16-byte aligned, plus a 1-byte
 * aligned (_u) variant. */
43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46 
/* Eight-element __bf16 vectors, 16-byte aligned. */
47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49 #endif
50 
51 /* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS applies always_inline and nodebug, selects the "sse2"
 * target feature, and sets a minimum vector width of 128. */
52 #define __DEFAULT_FN_ATTRS \
53  __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
54  __min_vector_width__(128)))
/* __DEFAULT_FN_ATTRS_MMX is the same set of attributes, but with the
 * "mmx,sse2" target features and a minimum vector width of 64. */
55 #define __DEFAULT_FN_ATTRS_MMX \
56  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), \
57  __min_vector_width__(64)))
58 
59 /// Adds lower double-precision values in both operands and returns the
60 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
61 /// are copied from the upper double-precision value of the first operand.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66 ///
67 /// \param __a
68 /// A 128-bit vector of [2 x double] containing one of the source operands.
69 /// \param __b
70 /// A 128-bit vector of [2 x double] containing one of the source operands.
71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
73 /// from the upper 64 bits of the first source operand.
74 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75  __m128d __b) {
76  __a[0] += __b[0];
77  return __a;
78 }
79 
80 /// Adds two 128-bit vectors of [2 x double].
81 ///
82 /// \headerfile <x86intrin.h>
83 ///
84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85 ///
86 /// \param __a
87 /// A 128-bit vector of [2 x double] containing one of the source operands.
88 /// \param __b
89 /// A 128-bit vector of [2 x double] containing one of the source operands.
90 /// \returns A 128-bit vector of [2 x double] containing the sums of both
91 /// operands.
92 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93  __m128d __b) {
94  return (__m128d)((__v2df)__a + (__v2df)__b);
95 }
96 
97 /// Subtracts the lower double-precision value of the second operand
98 /// from the lower double-precision value of the first operand and returns
99 /// the difference in the lower 64 bits of the result. The upper 64 bits of
100 /// the result are copied from the upper double-precision value of the first
101 /// operand.
102 ///
103 /// \headerfile <x86intrin.h>
104 ///
105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106 ///
107 /// \param __a
108 /// A 128-bit vector of [2 x double] containing the minuend.
109 /// \param __b
110 /// A 128-bit vector of [2 x double] containing the subtrahend.
111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112 /// difference of the lower 64 bits of both operands. The upper 64 bits are
113 /// copied from the upper 64 bits of the first source operand.
114 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115  __m128d __b) {
116  __a[0] -= __b[0];
117  return __a;
118 }
119 
120 /// Subtracts two 128-bit vectors of [2 x double].
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125 ///
126 /// \param __a
127 /// A 128-bit vector of [2 x double] containing the minuend.
128 /// \param __b
129 /// A 128-bit vector of [2 x double] containing the subtrahend.
130 /// \returns A 128-bit vector of [2 x double] containing the differences between
131 /// both operands.
132 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133  __m128d __b) {
134  return (__m128d)((__v2df)__a - (__v2df)__b);
135 }
136 
137 /// Multiplies lower double-precision values in both operands and returns
138 /// the product in the lower 64 bits of the result. The upper 64 bits of the
139 /// result are copied from the upper double-precision value of the first
140 /// operand.
141 ///
142 /// \headerfile <x86intrin.h>
143 ///
144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145 ///
146 /// \param __a
147 /// A 128-bit vector of [2 x double] containing one of the source operands.
148 /// \param __b
149 /// A 128-bit vector of [2 x double] containing one of the source operands.
150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151 /// product of the lower 64 bits of both operands. The upper 64 bits are
152 /// copied from the upper 64 bits of the first source operand.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154  __m128d __b) {
155  __a[0] *= __b[0];
156  return __a;
157 }
158 
159 /// Multiplies two 128-bit vectors of [2 x double].
160 ///
161 /// \headerfile <x86intrin.h>
162 ///
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164 ///
165 /// \param __a
166 /// A 128-bit vector of [2 x double] containing one of the operands.
167 /// \param __b
168 /// A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
170 /// operands.
171 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172  __m128d __b) {
173  return (__m128d)((__v2df)__a * (__v2df)__b);
174 }
175 
176 /// Divides the lower double-precision value of the first operand by the
177 /// lower double-precision value of the second operand and returns the
178 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
179 /// result are copied from the upper double-precision value of the first
180 /// operand.
181 ///
182 /// \headerfile <x86intrin.h>
183 ///
184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185 ///
186 /// \param __a
187 /// A 128-bit vector of [2 x double] containing the dividend.
188 /// \param __b
189 /// A 128-bit vector of [2 x double] containing the divisor.
190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
192 /// copied from the upper 64 bits of the first source operand.
193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194  __m128d __b) {
195  __a[0] /= __b[0];
196  return __a;
197 }
198 
199 /// Performs an element-by-element division of two 128-bit vectors of
200 /// [2 x double].
201 ///
202 /// \headerfile <x86intrin.h>
203 ///
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205 ///
206 /// \param __a
207 /// A 128-bit vector of [2 x double] containing the dividend.
208 /// \param __b
209 /// A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
211 /// operands.
212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213  __m128d __b) {
214  return (__m128d)((__v2df)__a / (__v2df)__b);
215 }
216 
217 /// Calculates the square root of the lower double-precision value of
218 /// the second operand and returns it in the lower 64 bits of the result.
219 /// The upper 64 bits of the result are copied from the upper
220 /// double-precision value of the first operand.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225 ///
226 /// \param __a
227 /// A 128-bit vector of [2 x double] containing one of the operands. The
228 /// upper 64 bits of this operand are copied to the upper 64 bits of the
229 /// result.
230 /// \param __b
231 /// A 128-bit vector of [2 x double] containing one of the operands. The
232 /// square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
235 /// bits are copied from the upper 64 bits of operand \a __a.
236 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237  __m128d __b) {
238  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239  return __extension__(__m128d){__c[0], __a[1]};
240 }
241 
242 /// Calculates the square root of each of the two values stored in a
243 /// 128-bit vector of [2 x double].
244 ///
245 /// \headerfile <x86intrin.h>
246 ///
247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248 ///
249 /// \param __a
250 /// A 128-bit vector of [2 x double].
251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
252 /// values in the operand.
253 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254  return __builtin_ia32_sqrtpd((__v2df)__a);
255 }
256 
257 /// Compares lower 64-bit double-precision values of both operands, and
258 /// returns the lesser of the pair of values in the lower 64-bits of the
259 /// result. The upper 64 bits of the result are copied from the upper
260 /// double-precision value of the first operand.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
265 ///
266 /// \param __a
267 /// A 128-bit vector of [2 x double] containing one of the operands. The
268 /// lower 64 bits of this operand are used in the comparison.
269 /// \param __b
270 /// A 128-bit vector of [2 x double] containing one of the operands. The
271 /// lower 64 bits of this operand are used in the comparison.
272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273 /// minimum value between both operands. The upper 64 bits are copied from
274 /// the upper 64 bits of the first source operand.
275 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
276  __m128d __b) {
277  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
278 }
279 
280 /// Performs element-by-element comparison of the two 128-bit vectors of
281 /// [2 x double] and returns the vector containing the lesser of each pair of
282 /// values.
283 ///
284 /// \headerfile <x86intrin.h>
285 ///
286 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
287 ///
288 /// \param __a
289 /// A 128-bit vector of [2 x double] containing one of the operands.
290 /// \param __b
291 /// A 128-bit vector of [2 x double] containing one of the operands.
292 /// \returns A 128-bit vector of [2 x double] containing the minimum values
293 /// between both operands.
294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
295  __m128d __b) {
296  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
297 }
298 
299 /// Compares lower 64-bit double-precision values of both operands, and
300 /// returns the greater of the pair of values in the lower 64-bits of the
301 /// result. The upper 64 bits of the result are copied from the upper
302 /// double-precision value of the first operand.
303 ///
304 /// \headerfile <x86intrin.h>
305 ///
306 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
307 ///
308 /// \param __a
309 /// A 128-bit vector of [2 x double] containing one of the operands. The
310 /// lower 64 bits of this operand are used in the comparison.
311 /// \param __b
312 /// A 128-bit vector of [2 x double] containing one of the operands. The
313 /// lower 64 bits of this operand are used in the comparison.
314 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315 /// maximum value between both operands. The upper 64 bits are copied from
316 /// the upper 64 bits of the first source operand.
317 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
318  __m128d __b) {
319  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
320 }
321 
322 /// Performs element-by-element comparison of the two 128-bit vectors of
323 /// [2 x double] and returns the vector containing the greater of each pair
324 /// of values.
325 ///
326 /// \headerfile <x86intrin.h>
327 ///
328 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
329 ///
330 /// \param __a
331 /// A 128-bit vector of [2 x double] containing one of the operands.
332 /// \param __b
333 /// A 128-bit vector of [2 x double] containing one of the operands.
334 /// \returns A 128-bit vector of [2 x double] containing the maximum values
335 /// between both operands.
336 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
337  __m128d __b) {
338  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
339 }
340 
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346 ///
347 /// \param __a
348 /// A 128-bit vector of [2 x double] containing one of the source operands.
349 /// \param __b
350 /// A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 /// values between both operands.
353 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
354  __m128d __b) {
355  return (__m128d)((__v2du)__a & (__v2du)__b);
356 }
357 
358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359 /// the one's complement of the values contained in the first source operand.
360 ///
361 /// \headerfile <x86intrin.h>
362 ///
363 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
364 ///
365 /// \param __a
366 /// A 128-bit vector of [2 x double] containing the left source operand. The
367 /// one's complement of this value is used in the bitwise AND.
368 /// \param __b
369 /// A 128-bit vector of [2 x double] containing the right source operand.
370 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371 /// values in the second operand and the one's complement of the first
372 /// operand.
373 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
374  __m128d __b) {
375  return (__m128d)(~(__v2du)__a & (__v2du)__b);
376 }
377 
378 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
379 ///
380 /// \headerfile <x86intrin.h>
381 ///
382 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
383 ///
384 /// \param __a
385 /// A 128-bit vector of [2 x double] containing one of the source operands.
386 /// \param __b
387 /// A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389 /// values between both operands.
390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
391  __m128d __b) {
392  return (__m128d)((__v2du)__a | (__v2du)__b);
393 }
394 
395 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
396 ///
397 /// \headerfile <x86intrin.h>
398 ///
399 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
400 ///
401 /// \param __a
402 /// A 128-bit vector of [2 x double] containing one of the source operands.
403 /// \param __b
404 /// A 128-bit vector of [2 x double] containing one of the source operands.
405 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406 /// values between both operands.
407 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
408  __m128d __b) {
409  return (__m128d)((__v2du)__a ^ (__v2du)__b);
410 }
411 
412 /// Compares each of the corresponding double-precision values of the
413 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414 /// for false, 0xFFFFFFFFFFFFFFFF for true.
415 ///
416 /// \headerfile <x86intrin.h>
417 ///
418 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
419 ///
420 /// \param __a
421 /// A 128-bit vector of [2 x double].
422 /// \param __b
423 /// A 128-bit vector of [2 x double].
424 /// \returns A 128-bit vector containing the comparison results.
425 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
426  __m128d __b) {
427  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
428 }
429 
430 /// Compares each of the corresponding double-precision values of the
431 /// 128-bit vectors of [2 x double] to determine if the values in the first
432 /// operand are less than those in the second operand. Each comparison
433 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
434 ///
435 /// \headerfile <x86intrin.h>
436 ///
437 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
438 ///
439 /// \param __a
440 /// A 128-bit vector of [2 x double].
441 /// \param __b
442 /// A 128-bit vector of [2 x double].
443 /// \returns A 128-bit vector containing the comparison results.
444 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
445  __m128d __b) {
446  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
447 }
448 
449 /// Compares each of the corresponding double-precision values of the
450 /// 128-bit vectors of [2 x double] to determine if the values in the first
451 /// operand are less than or equal to those in the second operand.
452 ///
453 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
454 ///
455 /// \headerfile <x86intrin.h>
456 ///
457 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
458 ///
459 /// \param __a
460 /// A 128-bit vector of [2 x double].
461 /// \param __b
462 /// A 128-bit vector of [2 x double].
463 /// \returns A 128-bit vector containing the comparison results.
464 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
465  __m128d __b) {
466  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
467 }
468 
469 /// Compares each of the corresponding double-precision values of the
470 /// 128-bit vectors of [2 x double] to determine if the values in the first
471 /// operand are greater than those in the second operand.
472 ///
473 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
474 ///
475 /// \headerfile <x86intrin.h>
476 ///
477 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
478 ///
479 /// \param __a
480 /// A 128-bit vector of [2 x double].
481 /// \param __b
482 /// A 128-bit vector of [2 x double].
483 /// \returns A 128-bit vector containing the comparison results.
484 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
485  __m128d __b) {
486  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
487 }
488 
489 /// Compares each of the corresponding double-precision values of the
490 /// 128-bit vectors of [2 x double] to determine if the values in the first
491 /// operand are greater than or equal to those in the second operand.
492 ///
493 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
494 ///
495 /// \headerfile <x86intrin.h>
496 ///
497 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
498 ///
499 /// \param __a
500 /// A 128-bit vector of [2 x double].
501 /// \param __b
502 /// A 128-bit vector of [2 x double].
503 /// \returns A 128-bit vector containing the comparison results.
504 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
505  __m128d __b) {
506  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
507 }
508 
509 /// Compares each of the corresponding double-precision values of the
510 /// 128-bit vectors of [2 x double] to determine if the values in the first
511 /// operand are ordered with respect to those in the second operand.
512 ///
513 /// A pair of double-precision values are "ordered" with respect to each
514 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
515 /// 0xFFFFFFFFFFFFFFFF for true.
516 ///
517 /// \headerfile <x86intrin.h>
518 ///
519 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
520 ///
521 /// \param __a
522 /// A 128-bit vector of [2 x double].
523 /// \param __b
524 /// A 128-bit vector of [2 x double].
525 /// \returns A 128-bit vector containing the comparison results.
526 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
527  __m128d __b) {
528  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
529 }
530 
531 /// Compares each of the corresponding double-precision values of the
532 /// 128-bit vectors of [2 x double] to determine if the values in the first
533 /// operand are unordered with respect to those in the second operand.
534 ///
535 /// A pair of double-precision values are "unordered" with respect to each
536 /// other if one or both values are NaN. Each comparison yields 0x0 for
537 /// false, 0xFFFFFFFFFFFFFFFF for true.
538 ///
539 /// \headerfile <x86intrin.h>
540 ///
541 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
542 /// instruction.
543 ///
544 /// \param __a
545 /// A 128-bit vector of [2 x double].
546 /// \param __b
547 /// A 128-bit vector of [2 x double].
548 /// \returns A 128-bit vector containing the comparison results.
549 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
550  __m128d __b) {
551  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
552 }
553 
554 /// Compares each of the corresponding double-precision values of the
555 /// 128-bit vectors of [2 x double] to determine if the values in the first
556 /// operand are unequal to those in the second operand.
557 ///
558 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
559 ///
560 /// \headerfile <x86intrin.h>
561 ///
562 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
563 ///
564 /// \param __a
565 /// A 128-bit vector of [2 x double].
566 /// \param __b
567 /// A 128-bit vector of [2 x double].
568 /// \returns A 128-bit vector containing the comparison results.
569 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
570  __m128d __b) {
571  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
572 }
573 
574 /// Compares each of the corresponding double-precision values of the
575 /// 128-bit vectors of [2 x double] to determine if the values in the first
576 /// operand are not less than those in the second operand.
577 ///
578 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
579 ///
580 /// \headerfile <x86intrin.h>
581 ///
582 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
583 ///
584 /// \param __a
585 /// A 128-bit vector of [2 x double].
586 /// \param __b
587 /// A 128-bit vector of [2 x double].
588 /// \returns A 128-bit vector containing the comparison results.
589 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
590  __m128d __b) {
591  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
592 }
593 
594 /// Compares each of the corresponding double-precision values of the
595 /// 128-bit vectors of [2 x double] to determine if the values in the first
596 /// operand are not less than or equal to those in the second operand.
597 ///
598 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
599 ///
600 /// \headerfile <x86intrin.h>
601 ///
602 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
603 ///
604 /// \param __a
605 /// A 128-bit vector of [2 x double].
606 /// \param __b
607 /// A 128-bit vector of [2 x double].
608 /// \returns A 128-bit vector containing the comparison results.
609 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
610  __m128d __b) {
611  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
612 }
613 
614 /// Compares each of the corresponding double-precision values of the
615 /// 128-bit vectors of [2 x double] to determine if the values in the first
616 /// operand are not greater than those in the second operand.
617 ///
618 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
619 ///
620 /// \headerfile <x86intrin.h>
621 ///
622 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
623 ///
624 /// \param __a
625 /// A 128-bit vector of [2 x double].
626 /// \param __b
627 /// A 128-bit vector of [2 x double].
628 /// \returns A 128-bit vector containing the comparison results.
629 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
630  __m128d __b) {
631  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
632 }
633 
634 /// Compares each of the corresponding double-precision values of the
635 /// 128-bit vectors of [2 x double] to determine if the values in the first
636 /// operand are not greater than or equal to those in the second operand.
637 ///
638 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
639 ///
640 /// \headerfile <x86intrin.h>
641 ///
642 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
643 ///
644 /// \param __a
645 /// A 128-bit vector of [2 x double].
646 /// \param __b
647 /// A 128-bit vector of [2 x double].
648 /// \returns A 128-bit vector containing the comparison results.
649 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
650  __m128d __b) {
651  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
652 }
653 
654 /// Compares the lower double-precision floating-point values in each of
655 /// the two 128-bit floating-point vectors of [2 x double] for equality.
656 ///
657 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
658 ///
659 /// \headerfile <x86intrin.h>
660 ///
661 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
662 ///
663 /// \param __a
664 /// A 128-bit vector of [2 x double]. The lower double-precision value is
665 /// compared to the lower double-precision value of \a __b.
666 /// \param __b
667 /// A 128-bit vector of [2 x double]. The lower double-precision value is
668 /// compared to the lower double-precision value of \a __a.
669 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
670 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
671 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
672  __m128d __b) {
673  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
674 }
675 
676 /// Compares the lower double-precision floating-point values in each of
677 /// the two 128-bit floating-point vectors of [2 x double] to determine if
678 /// the value in the first parameter is less than the corresponding value in
679 /// the second parameter.
680 ///
681 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
682 ///
683 /// \headerfile <x86intrin.h>
684 ///
685 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
686 ///
687 /// \param __a
688 /// A 128-bit vector of [2 x double]. The lower double-precision value is
689 /// compared to the lower double-precision value of \a __b.
690 /// \param __b
691 /// A 128-bit vector of [2 x double]. The lower double-precision value is
692 /// compared to the lower double-precision value of \a __a.
693 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
694 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
696  __m128d __b) {
697  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
698 }
699 
700 /// Compares the lower double-precision floating-point values in each of
701 /// the two 128-bit floating-point vectors of [2 x double] to determine if
702 /// the value in the first parameter is less than or equal to the
703 /// corresponding value in the second parameter.
704 ///
705 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
706 ///
707 /// \headerfile <x86intrin.h>
708 ///
709 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
710 ///
711 /// \param __a
712 /// A 128-bit vector of [2 x double]. The lower double-precision value is
713 /// compared to the lower double-precision value of \a __b.
714 /// \param __b
715 /// A 128-bit vector of [2 x double]. The lower double-precision value is
716 /// compared to the lower double-precision value of \a __a.
717 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
718 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
720  __m128d __b) {
721  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
722 }
723 
724 /// Compares the lower double-precision floating-point values in each of
725 /// the two 128-bit floating-point vectors of [2 x double] to determine if
726 /// the value in the first parameter is greater than the corresponding value
727 /// in the second parameter.
728 ///
729 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
730 ///
731 /// \headerfile <x86intrin.h>
732 ///
733 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
734 ///
735 /// \param __a
736 /// A 128-bit vector of [2 x double]. The lower double-precision value is
737 /// compared to the lower double-precision value of \a __b.
738 /// \param __b
739 /// A 128-bit vector of [2 x double]. The lower double-precision value is
740 /// compared to the lower double-precision value of \a __a.
741 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
742 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
744  __m128d __b) {
745  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
746  return __extension__(__m128d){__c[0], __a[1]};
747 }
748 
749 /// Compares the lower double-precision floating-point values in each of
750 /// the two 128-bit floating-point vectors of [2 x double] to determine if
751 /// the value in the first parameter is greater than or equal to the
752 /// corresponding value in the second parameter.
753 ///
754 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
755 ///
756 /// \headerfile <x86intrin.h>
757 ///
758 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
759 ///
760 /// \param __a
761 /// A 128-bit vector of [2 x double]. The lower double-precision value is
762 /// compared to the lower double-precision value of \a __b.
763 /// \param __b
764 /// A 128-bit vector of [2 x double]. The lower double-precision value is
765 /// compared to the lower double-precision value of \a __a.
766 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
767 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
768 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
769  __m128d __b) {
770  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
771  return __extension__(__m128d){__c[0], __a[1]};
772 }
773 
774 /// Compares the lower double-precision floating-point values in each of
775 /// the two 128-bit floating-point vectors of [2 x double] to determine if
776 /// the value in the first parameter is "ordered" with respect to the
777 /// corresponding value in the second parameter.
778 ///
779 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
780 /// of double-precision values are "ordered" with respect to each other if
781 /// neither value is a NaN.
782 ///
783 /// \headerfile <x86intrin.h>
784 ///
785 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
786 ///
787 /// \param __a
788 /// A 128-bit vector of [2 x double]. The lower double-precision value is
789 /// compared to the lower double-precision value of \a __b.
790 /// \param __b
791 /// A 128-bit vector of [2 x double]. The lower double-precision value is
792 /// compared to the lower double-precision value of \a __a.
793 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
794 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
796  __m128d __b) {
797  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
798 }
799 
800 /// Compares the lower double-precision floating-point values in each of
801 /// the two 128-bit floating-point vectors of [2 x double] to determine if
802 /// the value in the first parameter is "unordered" with respect to the
803 /// corresponding value in the second parameter.
804 ///
805 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
806 /// of double-precision values are "unordered" with respect to each other if
807 /// one or both values are NaN.
808 ///
809 /// \headerfile <x86intrin.h>
810 ///
811 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
812 /// instruction.
813 ///
814 /// \param __a
815 /// A 128-bit vector of [2 x double]. The lower double-precision value is
816 /// compared to the lower double-precision value of \a __b.
817 /// \param __b
818 /// A 128-bit vector of [2 x double]. The lower double-precision value is
819 /// compared to the lower double-precision value of \a __a.
820 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
821 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
822 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
823  __m128d __b) {
824  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
825 }
826 
827 /// Compares the lower double-precision floating-point values in each of
828 /// the two 128-bit floating-point vectors of [2 x double] to determine if
829 /// the value in the first parameter is unequal to the corresponding value in
830 /// the second parameter.
831 ///
832 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
833 ///
834 /// \headerfile <x86intrin.h>
835 ///
836 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
837 ///
838 /// \param __a
839 /// A 128-bit vector of [2 x double]. The lower double-precision value is
840 /// compared to the lower double-precision value of \a __b.
841 /// \param __b
842 /// A 128-bit vector of [2 x double]. The lower double-precision value is
843 /// compared to the lower double-precision value of \a __a.
844 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
845 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
846 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
847  __m128d __b) {
848  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
849 }
850 
851 /// Compares the lower double-precision floating-point values in each of
852 /// the two 128-bit floating-point vectors of [2 x double] to determine if
853 /// the value in the first parameter is not less than the corresponding
854 /// value in the second parameter.
855 ///
856 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
857 ///
858 /// \headerfile <x86intrin.h>
859 ///
860 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
861 ///
862 /// \param __a
863 /// A 128-bit vector of [2 x double]. The lower double-precision value is
864 /// compared to the lower double-precision value of \a __b.
865 /// \param __b
866 /// A 128-bit vector of [2 x double]. The lower double-precision value is
867 /// compared to the lower double-precision value of \a __a.
868 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
869 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
870 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
871  __m128d __b) {
872  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
873 }
874 
875 /// Compares the lower double-precision floating-point values in each of
876 /// the two 128-bit floating-point vectors of [2 x double] to determine if
877 /// the value in the first parameter is not less than or equal to the
878 /// corresponding value in the second parameter.
879 ///
880 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
881 ///
882 /// \headerfile <x86intrin.h>
883 ///
884 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
885 ///
886 /// \param __a
887 /// A 128-bit vector of [2 x double]. The lower double-precision value is
888 /// compared to the lower double-precision value of \a __b.
889 /// \param __b
890 /// A 128-bit vector of [2 x double]. The lower double-precision value is
891 /// compared to the lower double-precision value of \a __a.
892 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
893 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
895  __m128d __b) {
896  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
897 }
898 
899 /// Compares the lower double-precision floating-point values in each of
900 /// the two 128-bit floating-point vectors of [2 x double] to determine if
901 /// the value in the first parameter is not greater than the corresponding
902 /// value in the second parameter.
903 ///
904 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
905 ///
906 /// \headerfile <x86intrin.h>
907 ///
908 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
909 ///
910 /// \param __a
911 /// A 128-bit vector of [2 x double]. The lower double-precision value is
912 /// compared to the lower double-precision value of \a __b.
913 /// \param __b
914 /// A 128-bit vector of [2 x double]. The lower double-precision value is
915 /// compared to the lower double-precision value of \a __a.
916 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
917 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
918 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
919  __m128d __b) {
920  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
921  return __extension__(__m128d){__c[0], __a[1]};
922 }
923 
924 /// Compares the lower double-precision floating-point values in each of
925 /// the two 128-bit floating-point vectors of [2 x double] to determine if
926 /// the value in the first parameter is not greater than or equal to the
927 /// corresponding value in the second parameter.
928 ///
929 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
930 ///
931 /// \headerfile <x86intrin.h>
932 ///
933 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
934 ///
935 /// \param __a
936 /// A 128-bit vector of [2 x double]. The lower double-precision value is
937 /// compared to the lower double-precision value of \a __b.
938 /// \param __b
939 /// A 128-bit vector of [2 x double]. The lower double-precision value is
940 /// compared to the lower double-precision value of \a __a.
941 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
942 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
943 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
944  __m128d __b) {
945  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
946  return __extension__(__m128d){__c[0], __a[1]};
947 }
948 
949 /// Compares the lower double-precision floating-point values in each of
950 /// the two 128-bit floating-point vectors of [2 x double] for equality.
951 ///
952 /// The comparison yields 0 for false, 1 for true. If either of the two
953 /// lower double-precision values is NaN, 0 is returned.
954 ///
955 /// \headerfile <x86intrin.h>
956 ///
957 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
958 ///
959 /// \param __a
960 /// A 128-bit vector of [2 x double]. The lower double-precision value is
961 /// compared to the lower double-precision value of \a __b.
962 /// \param __b
963 /// A 128-bit vector of [2 x double]. The lower double-precision value is
964 /// compared to the lower double-precision value of \a __a.
965 /// \returns An integer containing the comparison results. If either of the two
966 /// lower double-precision values is NaN, 0 is returned.
967 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
968  __m128d __b) {
969  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
970 }
971 
972 /// Compares the lower double-precision floating-point values in each of
973 /// the two 128-bit floating-point vectors of [2 x double] to determine if
974 /// the value in the first parameter is less than the corresponding value in
975 /// the second parameter.
976 ///
977 /// The comparison yields 0 for false, 1 for true. If either of the two
978 /// lower double-precision values is NaN, 0 is returned.
979 ///
980 /// \headerfile <x86intrin.h>
981 ///
982 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
983 ///
984 /// \param __a
985 /// A 128-bit vector of [2 x double]. The lower double-precision value is
986 /// compared to the lower double-precision value of \a __b.
987 /// \param __b
988 /// A 128-bit vector of [2 x double]. The lower double-precision value is
989 /// compared to the lower double-precision value of \a __a.
990 /// \returns An integer containing the comparison results. If either of the two
991 /// lower double-precision values is NaN, 0 is returned.
992 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
993  __m128d __b) {
994  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
995 }
996 
997 /// Compares the lower double-precision floating-point values in each of
998 /// the two 128-bit floating-point vectors of [2 x double] to determine if
999 /// the value in the first parameter is less than or equal to the
1000 /// corresponding value in the second parameter.
1001 ///
1002 /// The comparison yields 0 for false, 1 for true. If either of the two
1003 /// lower double-precision values is NaN, 0 is returned.
1004 ///
1005 /// \headerfile <x86intrin.h>
1006 ///
1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008 ///
1009 /// \param __a
1010 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1011 /// compared to the lower double-precision value of \a __b.
1012 /// \param __b
1013 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1014 /// compared to the lower double-precision value of \a __a.
1015 /// \returns An integer containing the comparison results. If either of the two
1016 /// lower double-precision values is NaN, 0 is returned.
1017 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1018  __m128d __b) {
1019  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1020 }
1021 
1022 /// Compares the lower double-precision floating-point values in each of
1023 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1024 /// the value in the first parameter is greater than the corresponding value
1025 /// in the second parameter.
1026 ///
1027 /// The comparison yields 0 for false, 1 for true. If either of the two
1028 /// lower double-precision values is NaN, 0 is returned.
1029 ///
1030 /// \headerfile <x86intrin.h>
1031 ///
1032 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1033 ///
1034 /// \param __a
1035 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1036 /// compared to the lower double-precision value of \a __b.
1037 /// \param __b
1038 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1039 /// compared to the lower double-precision value of \a __a.
1040 /// \returns An integer containing the comparison results. If either of the two
1041 /// lower double-precision values is NaN, 0 is returned.
1042 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1043  __m128d __b) {
1044  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1045 }
1046 
1047 /// Compares the lower double-precision floating-point values in each of
1048 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1049 /// the value in the first parameter is greater than or equal to the
1050 /// corresponding value in the second parameter.
1051 ///
1052 /// The comparison yields 0 for false, 1 for true. If either of the two
1053 /// lower double-precision values is NaN, 0 is returned.
1054 ///
1055 /// \headerfile <x86intrin.h>
1056 ///
1057 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1058 ///
1059 /// \param __a
1060 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1061 /// compared to the lower double-precision value of \a __b.
1062 /// \param __b
1063 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1064 /// compared to the lower double-precision value of \a __a.
1065 /// \returns An integer containing the comparison results. If either of the two
1066 /// lower double-precision values is NaN, 0 is returned.
1067 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1068  __m128d __b) {
1069  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1070 }
1071 
1072 /// Compares the lower double-precision floating-point values in each of
1073 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1074 /// the value in the first parameter is unequal to the corresponding value in
1075 /// the second parameter.
1076 ///
1077 /// The comparison yields 0 for false, 1 for true. If either of the two
1078 /// lower double-precision values is NaN, 1 is returned.
1079 ///
1080 /// \headerfile <x86intrin.h>
1081 ///
1082 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1083 ///
1084 /// \param __a
1085 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1086 /// compared to the lower double-precision value of \a __b.
1087 /// \param __b
1088 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1089 /// compared to the lower double-precision value of \a __a.
1090 /// \returns An integer containing the comparison results. If either of the two
1091 /// lower double-precision values is NaN, 1 is returned.
1092 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1093  __m128d __b) {
1094  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1095 }
1096 
1097 /// Compares the lower double-precision floating-point values in each of
1098 /// the two 128-bit floating-point vectors of [2 x double] for equality. The
1099 /// comparison yields 0 for false, 1 for true.
1100 ///
1101 /// If either of the two lower double-precision values is NaN, 0 is returned.
1102 ///
1103 /// \headerfile <x86intrin.h>
1104 ///
1105 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1106 ///
1107 /// \param __a
1108 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1109 /// compared to the lower double-precision value of \a __b.
1110 /// \param __b
1111 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1112 /// compared to the lower double-precision value of \a __a.
1113 /// \returns An integer containing the comparison results. If either of the two
1114 /// lower double-precision values is NaN, 0 is returned.
1115 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1116  __m128d __b) {
1117  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1118 }
1119 
1120 /// Compares the lower double-precision floating-point values in each of
1121 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1122 /// the value in the first parameter is less than the corresponding value in
1123 /// the second parameter.
1124 ///
1125 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1126 /// double-precision values is NaN, 0 is returned.
1127 ///
1128 /// \headerfile <x86intrin.h>
1129 ///
1130 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1131 ///
1132 /// \param __a
1133 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1134 /// compared to the lower double-precision value of \a __b.
1135 /// \param __b
1136 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1137 /// compared to the lower double-precision value of \a __a.
1138 /// \returns An integer containing the comparison results. If either of the two
1139 /// lower double-precision values is NaN, 0 is returned.
1140 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1141  __m128d __b) {
1142  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1143 }
1144 
1145 /// Compares the lower double-precision floating-point values in each of
1146 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1147 /// the value in the first parameter is less than or equal to the
1148 /// corresponding value in the second parameter.
1149 ///
1150 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1151 /// double-precision values is NaN, 0 is returned.
1152 ///
1153 /// \headerfile <x86intrin.h>
1154 ///
1155 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1156 ///
1157 /// \param __a
1158 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1159 /// compared to the lower double-precision value of \a __b.
1160 /// \param __b
1161 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1162 /// compared to the lower double-precision value of \a __a.
1163 /// \returns An integer containing the comparison results. If either of the two
1164 /// lower double-precision values is NaN, 0 is returned.
1165 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1166  __m128d __b) {
1167  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1168 }
1169 
1170 /// Compares the lower double-precision floating-point values in each of
1171 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1172 /// the value in the first parameter is greater than the corresponding value
1173 /// in the second parameter.
1174 ///
1175 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1176 /// double-precision values is NaN, 0 is returned.
1177 ///
1178 /// \headerfile <x86intrin.h>
1179 ///
1180 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1181 ///
1182 /// \param __a
1183 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1184 /// compared to the lower double-precision value of \a __b.
1185 /// \param __b
1186 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1187 /// compared to the lower double-precision value of \a __a.
1188 /// \returns An integer containing the comparison results. If either of the two
1189 /// lower double-precision values is NaN, 0 is returned.
1190 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1191  __m128d __b) {
1192  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1193 }
1194 
1195 /// Compares the lower double-precision floating-point values in each of
1196 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1197 /// the value in the first parameter is greater than or equal to the
1198 /// corresponding value in the second parameter.
1199 ///
1200 /// The comparison yields 0 for false, 1 for true. If either of the two
1201 /// lower double-precision values is NaN, 0 is returned.
1202 ///
1203 /// \headerfile <x86intrin.h>
1204 ///
1205 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1206 ///
1207 /// \param __a
1208 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1209 /// compared to the lower double-precision value of \a __b.
1210 /// \param __b
1211 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1212 /// compared to the lower double-precision value of \a __a.
1213 /// \returns An integer containing the comparison results. If either of the two
1214 /// lower double-precision values is NaN, 0 is returned.
1215 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1216  __m128d __b) {
1217  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1218 }
1219 
1220 /// Compares the lower double-precision floating-point values in each of
1221 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1222 /// the value in the first parameter is unequal to the corresponding value in
1223 /// the second parameter.
1224 ///
1225 /// The comparison yields 0 for false, 1 for true. If either of the two lower
1226 /// double-precision values is NaN, 1 is returned.
1227 ///
1228 /// \headerfile <x86intrin.h>
1229 ///
1230 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1231 ///
1232 /// \param __a
1233 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1234 /// compared to the lower double-precision value of \a __b.
1235 /// \param __b
1236 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1237 /// compared to the lower double-precision value of \a __a.
1238 /// \returns An integer containing the comparison result. If either of the two
1239 /// lower double-precision values is NaN, 1 is returned.
1240 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1241  __m128d __b) {
1242  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1243 }
1244 
1245 /// Converts the two double-precision floating-point elements of a
1246 /// 128-bit vector of [2 x double] into two single-precision floating-point
1247 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1248 /// The upper 64 bits of the result vector are set to zero.
1249 ///
1250 /// \headerfile <x86intrin.h>
1251 ///
1252 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1253 ///
1254 /// \param __a
1255 /// A 128-bit vector of [2 x double].
1256 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1257 /// converted values. The upper 64 bits are set to zero.
1258 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1259  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1260 }
1261 
1262 /// Converts the lower two single-precision floating-point elements of a
1263 /// 128-bit vector of [4 x float] into two double-precision floating-point
1264 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1265 /// elements of the input vector are unused.
1266 ///
1267 /// \headerfile <x86intrin.h>
1268 ///
1269 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1270 ///
1271 /// \param __a
1272 /// A 128-bit vector of [4 x float]. The lower two single-precision
1273 /// floating-point elements are converted to double-precision values. The
1274 /// upper two elements are unused.
1275 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1276 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1277  return (__m128d) __builtin_convertvector(
1278  __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1279 }
1280 
1281 /// Converts the lower two integer elements of a 128-bit vector of
1282 /// [4 x i32] into two double-precision floating-point values, returned in a
1283 /// 128-bit vector of [2 x double].
1284 ///
1285 /// The upper two elements of the input vector are unused.
1286 ///
1287 /// \headerfile <x86intrin.h>
1288 ///
1289 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1290 ///
1291 /// \param __a
1292 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1293 /// converted to double-precision values.
1294 ///
1295 /// The upper two elements are unused.
1296 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1297 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1298  return (__m128d) __builtin_convertvector(
1299  __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1300 }
1301 
1302 /// Converts the two double-precision floating-point elements of a
1303 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1304 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1305 /// 64 bits of the result vector are set to zero.
1306 ///
1307 /// \headerfile <x86intrin.h>
1308 ///
1309 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1310 ///
1311 /// \param __a
1312 /// A 128-bit vector of [2 x double].
1313 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1314 /// converted values. The upper 64 bits are set to zero.
1315 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1316  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1317 }
1318 
1319 /// Converts the low-order element of a 128-bit vector of [2 x double]
1320 /// into a 32-bit signed integer value.
1321 ///
1322 /// \headerfile <x86intrin.h>
1323 ///
1324 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1325 ///
1326 /// \param __a
1327 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1328 /// conversion.
1329 /// \returns A 32-bit signed integer containing the converted value.
1330 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1331  return __builtin_ia32_cvtsd2si((__v2df)__a);
1332 }
1333 
1334 /// Converts the lower double-precision floating-point element of a
1335 /// 128-bit vector of [2 x double], in the second parameter, into a
1336 /// single-precision floating-point value, returned in the lower 32 bits of a
1337 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1338 /// copied from the upper 96 bits of the first parameter.
1339 ///
1340 /// \headerfile <x86intrin.h>
1341 ///
1342 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1343 ///
1344 /// \param __a
1345 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1346 /// copied to the upper 96 bits of the result.
1347 /// \param __b
1348 /// A 128-bit vector of [2 x double]. The lower double-precision
1349 /// floating-point element is used in the conversion.
1350 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1351 /// converted value from the second parameter. The upper 96 bits are copied
1352 /// from the upper 96 bits of the first parameter.
1353 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1354  __m128d __b) {
1355  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1356 }
1357 
1358 /// Converts a 32-bit signed integer value, in the second parameter, into
1359 /// a double-precision floating-point value, returned in the lower 64 bits of
1360 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361 /// are copied from the upper 64 bits of the first parameter.
1362 ///
1363 /// \headerfile <x86intrin.h>
1364 ///
1365 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1366 ///
1367 /// \param __a
1368 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369 /// copied to the upper 64 bits of the result.
1370 /// \param __b
1371 /// A 32-bit signed integer containing the value to be converted.
1372 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373 /// converted value from the second parameter. The upper 64 bits are copied
1374 /// from the upper 64 bits of the first parameter.
1375 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1376  int __b) {
1377  __a[0] = __b;
1378  return __a;
1379 }
1380 
1381 /// Converts the lower single-precision floating-point element of a
1382 /// 128-bit vector of [4 x float], in the second parameter, into a
1383 /// double-precision floating-point value, returned in the lower 64 bits of
1384 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1385 /// are copied from the upper 64 bits of the first parameter.
1386 ///
1387 /// \headerfile <x86intrin.h>
1388 ///
1389 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1390 ///
1391 /// \param __a
1392 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1393 /// copied to the upper 64 bits of the result.
1394 /// \param __b
1395 /// A 128-bit vector of [4 x float]. The lower single-precision
1396 /// floating-point element is used in the conversion.
1397 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1398 /// converted value from the second parameter. The upper 64 bits are copied
1399 /// from the upper 64 bits of the first parameter.
1400 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1401  __m128 __b) {
1402  __a[0] = __b[0];
1403  return __a;
1404 }
1405 
1406 /// Converts the two double-precision floating-point elements of a
1407 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1408 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1409 ///
1410 /// If the result of either conversion is inexact, the result is truncated
1411 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1412 /// 64 bits of the result vector are set to zero.
1413 ///
1414 /// \headerfile <x86intrin.h>
1415 ///
1416 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1417 /// instruction.
1418 ///
1419 /// \param __a
1420 /// A 128-bit vector of [2 x double].
1421 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422 /// converted values. The upper 64 bits are set to zero.
1423 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1424  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1425 }
1426 
1427 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1428 /// signed integer value, truncating the result when it is inexact.
1429 ///
1430 /// \headerfile <x86intrin.h>
1431 ///
1432 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1433 /// instruction.
1434 ///
1435 /// \param __a
1436 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1437 /// conversion.
1438 /// \returns A 32-bit signed integer containing the converted value.
1439 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1440  return __builtin_ia32_cvttsd2si((__v2df)__a);
1441 }
1442 
1443 /// Converts the two double-precision floating-point elements of a
1444 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1445 /// returned in a 64-bit vector of [2 x i32].
1446 ///
1447 /// \headerfile <x86intrin.h>
1448 ///
1449 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1450 ///
1451 /// \param __a
1452 /// A 128-bit vector of [2 x double].
1453 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1454 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1455  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1456 }
1457 
1458 /// Converts the two double-precision floating-point elements of a
1459 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1460 /// returned in a 64-bit vector of [2 x i32].
1461 ///
1462 /// If the result of either conversion is inexact, the result is truncated
1463 /// (rounded towards zero) regardless of the current MXCSR setting.
1464 ///
1465 /// \headerfile <x86intrin.h>
1466 ///
1467 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1468 ///
1469 /// \param __a
1470 /// A 128-bit vector of [2 x double].
1471 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1472 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1473  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1474 }
1475 
1476 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1477 /// [2 x i32] into two double-precision floating-point values, returned in a
1478 /// 128-bit vector of [2 x double].
1479 ///
1480 /// \headerfile <x86intrin.h>
1481 ///
1482 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1483 ///
1484 /// \param __a
1485 /// A 64-bit vector of [2 x i32].
1486 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1487 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1488  return __builtin_ia32_cvtpi2pd((__v2si)__a);
1489 }
1490 
1491 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1492 /// a double-precision floating-point value.
1493 ///
1494 /// \headerfile <x86intrin.h>
1495 ///
1496 /// This intrinsic has no corresponding instruction.
1497 ///
1498 /// \param __a
1499 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1500 /// \returns A double-precision floating-point value copied from the lower 64
1501 /// bits of \a __a.
1502 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1503  return __a[0];
1504 }
1505 
1506 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1507 /// memory location.
1508 ///
1509 /// \headerfile <x86intrin.h>
1510 ///
1511 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1512 ///
1513 /// \param __dp
1514 /// A pointer to a 128-bit memory location. The address of the memory
1515 /// location has to be 16-byte aligned.
1516 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1517 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1518  return *(const __m128d *)__dp;
1519 }
1520 
1521 /// Loads a double-precision floating-point value from a specified memory
1522 /// location and duplicates it to both vector elements of a 128-bit vector of
1523 /// [2 x double].
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
1527 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1528 ///
1529 /// \param __dp
1530 /// A pointer to a memory location containing a double-precision value.
1531 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1532 /// duplicated values.
1533 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1534  struct __mm_load1_pd_struct {
1535  double __u;
1536  } __attribute__((__packed__, __may_alias__));
1537  double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1538  return __extension__(__m128d){__u, __u};
1539 }
1540 
/* Alternate spelling: _mm_load_pd1 expands to a call to _mm_load1_pd. */
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1542 
1543 /// Loads two double-precision values, in reverse order, from an aligned
1544 /// memory location into a 128-bit vector of [2 x double].
1545 ///
1546 /// \headerfile <x86intrin.h>
1547 ///
1548 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1549 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1550 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1551 ///
1552 /// \param __dp
1553 /// A 16-byte aligned pointer to an array of double-precision values to be
1554 /// loaded in reverse order.
1555 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1556 /// values.
1557 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1558  __m128d __u = *(const __m128d *)__dp;
1559  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1560 }
1561 
1562 /// Loads a 128-bit floating-point vector of [2 x double] from an
1563 /// unaligned memory location.
1564 ///
1565 /// \headerfile <x86intrin.h>
1566 ///
1567 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1568 ///
1569 /// \param __dp
1570 /// A pointer to a 128-bit memory location. The address of the memory
1571 /// location does not have to be aligned.
1572 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1573 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1574  struct __loadu_pd {
1575  __m128d_u __v;
1576  } __attribute__((__packed__, __may_alias__));
1577  return ((const struct __loadu_pd *)__dp)->__v;
1578 }
1579 
1580 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1581 /// vector and clears the upper element.
1582 ///
1583 /// \headerfile <x86intrin.h>
1584 ///
1585 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1586 ///
1587 /// \param __a
1588 /// A pointer to a 64-bit memory location. The address of the memory
1589 /// location does not have to be aligned.
1590 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1592  struct __loadu_si64 {
1593  long long __v;
1594  } __attribute__((__packed__, __may_alias__));
1595  long long __u = ((const struct __loadu_si64 *)__a)->__v;
1596  return __extension__(__m128i)(__v2di){__u, 0LL};
1597 }
1598 
1599 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1600 /// vector and clears the upper elements.
1601 ///
1602 /// \headerfile <x86intrin.h>
1603 ///
1604 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1605 ///
1606 /// \param __a
1607 /// A pointer to a 32-bit memory location. The address of the memory
1608 /// location does not have to be aligned.
1609 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1610 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1611  struct __loadu_si32 {
1612  int __v;
1613  } __attribute__((__packed__, __may_alias__));
1614  int __u = ((const struct __loadu_si32 *)__a)->__v;
1615  return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1616 }
1617 
1618 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1619 /// vector and clears the upper elements.
1620 ///
1621 /// \headerfile <x86intrin.h>
1622 ///
1623 /// This intrinsic does not correspond to a specific instruction.
1624 ///
1625 /// \param __a
1626 /// A pointer to a 16-bit memory location. The address of the memory
1627 /// location does not have to be aligned.
1628 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1629 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1630  struct __loadu_si16 {
1631  short __v;
1632  } __attribute__((__packed__, __may_alias__));
1633  short __u = ((const struct __loadu_si16 *)__a)->__v;
1634  return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1635 }
1636 
1637 /// Loads a 64-bit double-precision value to the low element of a
1638 /// 128-bit vector of [2 x double] and clears the upper element.
1639 ///
1640 /// \headerfile <x86intrin.h>
1641 ///
1642 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1643 ///
1644 /// \param __dp
1645 /// A pointer to a memory location containing a double-precision value.
1646 /// The address of the memory location does not have to be aligned.
1647 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1649  struct __mm_load_sd_struct {
1650  double __u;
1651  } __attribute__((__packed__, __may_alias__));
1652  double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1653  return __extension__(__m128d){__u, 0};
1654 }
1655 
1656 /// Loads a double-precision value into the high-order bits of a 128-bit
1657 /// vector of [2 x double]. The low-order bits are copied from the low-order
1658 /// bits of the first operand.
1659 ///
1660 /// \headerfile <x86intrin.h>
1661 ///
1662 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1663 ///
1664 /// \param __a
1665 /// A 128-bit vector of [2 x double]. \n
1666 /// Bits [63:0] are written to bits [63:0] of the result.
1667 /// \param __dp
1668 /// A pointer to a 64-bit memory location containing a double-precision
1669 /// floating-point value that is loaded. The loaded value is written to bits
1670 /// [127:64] of the result. The address of the memory location does not have
1671 /// to be aligned.
1672 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1673 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1674  double const *__dp) {
1675  struct __mm_loadh_pd_struct {
1676  double __u;
1677  } __attribute__((__packed__, __may_alias__));
1678  double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1679  return __extension__(__m128d){__a[0], __u};
1680 }
1681 
1682 /// Loads a double-precision value into the low-order bits of a 128-bit
1683 /// vector of [2 x double]. The high-order bits are copied from the
1684 /// high-order bits of the first operand.
1685 ///
1686 /// \headerfile <x86intrin.h>
1687 ///
1688 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1689 ///
1690 /// \param __a
1691 /// A 128-bit vector of [2 x double]. \n
1692 /// Bits [127:64] are written to bits [127:64] of the result.
1693 /// \param __dp
1694 /// A pointer to a 64-bit memory location containing a double-precision
1695 /// floating-point value that is loaded. The loaded value is written to bits
1696 /// [63:0] of the result. The address of the memory location does not have to
1697 /// be aligned.
1698 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1699 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1700  double const *__dp) {
1701  struct __mm_loadl_pd_struct {
1702  double __u;
1703  } __attribute__((__packed__, __may_alias__));
1704  double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1705  return __extension__(__m128d){__u, __a[1]};
1706 }
1707 
1708 /// Constructs a 128-bit floating-point vector of [2 x double] with
1709 /// unspecified content. This could be used as an argument to another
1710 /// intrinsic function where the argument is required but the value is not
1711 /// actually used.
1712 ///
1713 /// \headerfile <x86intrin.h>
1714 ///
1715 /// This intrinsic has no corresponding instruction.
1716 ///
1717 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1718 /// content.
1719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1720  return (__m128d)__builtin_ia32_undef128();
1721 }
1722 
1723 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1724 /// 64 bits of the vector are initialized with the specified double-precision
1725 /// floating-point value. The upper 64 bits are set to zero.
1726 ///
1727 /// \headerfile <x86intrin.h>
1728 ///
1729 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1730 ///
1731 /// \param __w
1732 /// A double-precision floating-point value used to initialize the lower 64
1733 /// bits of the result.
1734 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1735 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1736 /// set to zero.
1737 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1738  return __extension__(__m128d){__w, 0};
1739 }
1740 
1741 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1742 /// of the two double-precision floating-point vector elements set to the
1743 /// specified double-precision floating-point value.
1744 ///
1745 /// \headerfile <x86intrin.h>
1746 ///
1747 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1748 ///
1749 /// \param __w
1750 /// A double-precision floating-point value used to initialize each vector
1751 /// element of the result.
1752 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1753 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1754  return __extension__(__m128d){__w, __w};
1755 }
1756 
1757 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1758 /// of the two double-precision floating-point vector elements set to the
1759 /// specified double-precision floating-point value.
1760 ///
1761 /// \headerfile <x86intrin.h>
1762 ///
1763 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1764 ///
1765 /// \param __w
1766 /// A double-precision floating-point value used to initialize each vector
1767 /// element of the result.
1768 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1769 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1770  return _mm_set1_pd(__w);
1771 }
1772 
1773 /// Constructs a 128-bit floating-point vector of [2 x double]
1774 /// initialized with the specified double-precision floating-point values.
1775 ///
1776 /// \headerfile <x86intrin.h>
1777 ///
1778 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1779 ///
1780 /// \param __w
1781 /// A double-precision floating-point value used to initialize the upper 64
1782 /// bits of the result.
1783 /// \param __x
1784 /// A double-precision floating-point value used to initialize the lower 64
1785 /// bits of the result.
1786 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1787 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1788  double __x) {
1789  return __extension__(__m128d){__x, __w};
1790 }
1791 
1792 /// Constructs a 128-bit floating-point vector of [2 x double],
1793 /// initialized in reverse order with the specified double-precision
1794 /// floating-point values.
1795 ///
1796 /// \headerfile <x86intrin.h>
1797 ///
1798 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1799 ///
1800 /// \param __w
1801 /// A double-precision floating-point value used to initialize the lower 64
1802 /// bits of the result.
1803 /// \param __x
1804 /// A double-precision floating-point value used to initialize the upper 64
1805 /// bits of the result.
1806 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1807 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1808  double __x) {
1809  return __extension__(__m128d){__w, __x};
1810 }
1811 
1812 /// Constructs a 128-bit floating-point vector of [2 x double]
1813 /// initialized to zero.
1814 ///
1815 /// \headerfile <x86intrin.h>
1816 ///
1817 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1818 ///
1819 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1820 /// all elements set to zero.
1821 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1822  return __extension__(__m128d){0.0, 0.0};
1823 }
1824 
1825 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1826 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1827 /// 64 bits are set to the upper 64 bits of the first parameter.
1828 ///
1829 /// \headerfile <x86intrin.h>
1830 ///
1831 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1832 ///
1833 /// \param __a
1834 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1835 /// upper 64 bits of the result.
1836 /// \param __b
1837 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1838 /// lower 64 bits of the result.
1839 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1840 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1841  __m128d __b) {
1842  __a[0] = __b[0];
1843  return __a;
1844 }
1845 
1846 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1847 /// memory location.
1848 ///
1849 /// \headerfile <x86intrin.h>
1850 ///
1851 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1852 ///
1853 /// \param __dp
1854 /// A pointer to a 64-bit memory location.
1855 /// \param __a
1856 /// A 128-bit vector of [2 x double] containing the value to be stored.
1857 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1858  __m128d __a) {
1859  struct __mm_store_sd_struct {
1860  double __u;
1861  } __attribute__((__packed__, __may_alias__));
1862  ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1863 }
1864 
1865 /// Moves packed double-precision values from a 128-bit vector of
1866 /// [2 x double] to a memory location.
1867 ///
1868 /// \headerfile <x86intrin.h>
1869 ///
1870 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1871 ///
1872 /// \param __dp
1873 /// A pointer to an aligned memory location that can store two
1874 /// double-precision values.
1875 /// \param __a
1876 /// A packed 128-bit vector of [2 x double] containing the values to be
1877 /// moved.
1878 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1879  __m128d __a) {
1880  *(__m128d *)__dp = __a;
1881 }
1882 
1883 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1884 /// the upper and lower 64 bits of a memory location.
1885 ///
1886 /// \headerfile <x86intrin.h>
1887 ///
1888 /// This intrinsic corresponds to the
1889 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1890 ///
1891 /// \param __dp
1892 /// A pointer to a 16-byte aligned memory location that can store two
1893 /// double-precision values.
1894 /// \param __a
1895 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1896 /// of the values in \a __dp.
1897 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1898  __m128d __a) {
1899  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1900  _mm_store_pd(__dp, __a);
1901 }
1902 
1903 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1904 /// the upper and lower 64 bits of a memory location.
1905 ///
1906 /// \headerfile <x86intrin.h>
1907 ///
1908 /// This intrinsic corresponds to the
1909 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1910 ///
1911 /// \param __dp
1912 /// A pointer to a 16-byte aligned memory location that can store two
1913 /// double-precision values.
1914 /// \param __a
1915 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1916 /// of the values in \a __dp.
1917 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1918  __m128d __a) {
1919  _mm_store1_pd(__dp, __a);
1920 }
1921 
1922 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1923 /// location.
1924 ///
1925 /// \headerfile <x86intrin.h>
1926 ///
1927 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1928 ///
1929 /// \param __dp
1930 /// A pointer to a 128-bit memory location. The address of the memory
1931 /// location does not have to be aligned.
1932 /// \param __a
1933 /// A 128-bit vector of [2 x double] containing the values to be stored.
1934 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1935  __m128d __a) {
1936  struct __storeu_pd {
1937  __m128d_u __v;
1938  } __attribute__((__packed__, __may_alias__));
1939  ((struct __storeu_pd *)__dp)->__v = __a;
1940 }
1941 
1942 /// Stores two double-precision values, in reverse order, from a 128-bit
1943 /// vector of [2 x double] to a 16-byte aligned memory location.
1944 ///
1945 /// \headerfile <x86intrin.h>
1946 ///
1947 /// This intrinsic corresponds to a shuffling instruction followed by a
1948 /// <c> VMOVAPD / MOVAPD </c> instruction.
1949 ///
1950 /// \param __dp
1951 /// A pointer to a 16-byte aligned memory location that can store two
1952 /// double-precision values.
1953 /// \param __a
1954 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1955 /// stored.
1956 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1957  __m128d __a) {
1958  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1959  *(__m128d *)__dp = __a;
1960 }
1961 
1962 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1963 /// memory location.
1964 ///
1965 /// \headerfile <x86intrin.h>
1966 ///
1967 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1968 ///
1969 /// \param __dp
1970 /// A pointer to a 64-bit memory location.
1971 /// \param __a
1972 /// A 128-bit vector of [2 x double] containing the value to be stored.
1973 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1974  __m128d __a) {
1975  struct __mm_storeh_pd_struct {
1976  double __u;
1977  } __attribute__((__packed__, __may_alias__));
1978  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1979 }
1980 
1981 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1982 /// memory location.
1983 ///
1984 /// \headerfile <x86intrin.h>
1985 ///
1986 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1987 ///
1988 /// \param __dp
1989 /// A pointer to a 64-bit memory location.
1990 /// \param __a
1991 /// A 128-bit vector of [2 x double] containing the value to be stored.
1992 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1993  __m128d __a) {
1994  struct __mm_storeh_pd_struct {
1995  double __u;
1996  } __attribute__((__packed__, __may_alias__));
1997  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1998 }
1999 
2000 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2001 /// saving the lower 8 bits of each sum in the corresponding element of a
2002 /// 128-bit result vector of [16 x i8].
2003 ///
2004 /// The integer elements of both parameters can be either signed or unsigned.
2005 ///
2006 /// \headerfile <x86intrin.h>
2007 ///
2008 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2009 ///
2010 /// \param __a
2011 /// A 128-bit vector of [16 x i8].
2012 /// \param __b
2013 /// A 128-bit vector of [16 x i8].
2014 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2015 /// parameters.
2016 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2017  __m128i __b) {
2018  return (__m128i)((__v16qu)__a + (__v16qu)__b);
2019 }
2020 
2021 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2022 /// saving the lower 16 bits of each sum in the corresponding element of a
2023 /// 128-bit result vector of [8 x i16].
2024 ///
2025 /// The integer elements of both parameters can be either signed or unsigned.
2026 ///
2027 /// \headerfile <x86intrin.h>
2028 ///
2029 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2030 ///
2031 /// \param __a
2032 /// A 128-bit vector of [8 x i16].
2033 /// \param __b
2034 /// A 128-bit vector of [8 x i16].
2035 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2036 /// parameters.
2037 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2038  __m128i __b) {
2039  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2040 }
2041 
2042 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2043 /// saving the lower 32 bits of each sum in the corresponding element of a
2044 /// 128-bit result vector of [4 x i32].
2045 ///
2046 /// The integer elements of both parameters can be either signed or unsigned.
2047 ///
2048 /// \headerfile <x86intrin.h>
2049 ///
2050 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2051 ///
2052 /// \param __a
2053 /// A 128-bit vector of [4 x i32].
2054 /// \param __b
2055 /// A 128-bit vector of [4 x i32].
2056 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2057 /// parameters.
2058 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2059  __m128i __b) {
2060  return (__m128i)((__v4su)__a + (__v4su)__b);
2061 }
2062 
2063 /// Adds two signed or unsigned 64-bit integer values, returning the
2064 /// lower 64 bits of the sum.
2065 ///
2066 /// \headerfile <x86intrin.h>
2067 ///
2068 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2069 ///
2070 /// \param __a
2071 /// A 64-bit integer.
2072 /// \param __b
2073 /// A 64-bit integer.
2074 /// \returns A 64-bit integer containing the sum of both parameters.
2075 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2076  __m64 __b) {
2077  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2078 }
2079 
2080 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2081 /// saving the lower 64 bits of each sum in the corresponding element of a
2082 /// 128-bit result vector of [2 x i64].
2083 ///
2084 /// The integer elements of both parameters can be either signed or unsigned.
2085 ///
2086 /// \headerfile <x86intrin.h>
2087 ///
2088 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2089 ///
2090 /// \param __a
2091 /// A 128-bit vector of [2 x i64].
2092 /// \param __b
2093 /// A 128-bit vector of [2 x i64].
2094 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2095 /// parameters.
2096 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2097  __m128i __b) {
2098  return (__m128i)((__v2du)__a + (__v2du)__b);
2099 }
2100 
2101 /// Adds, with saturation, the corresponding elements of two 128-bit
2102 /// signed [16 x i8] vectors, saving each sum in the corresponding element of
2103 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2104 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2105 ///
2106 /// \headerfile <x86intrin.h>
2107 ///
2108 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2109 ///
2110 /// \param __a
2111 /// A 128-bit signed [16 x i8] vector.
2112 /// \param __b
2113 /// A 128-bit signed [16 x i8] vector.
2114 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2115 /// both parameters.
2116 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2117  __m128i __b) {
2118  return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2119 }
2120 
2121 /// Adds, with saturation, the corresponding elements of two 128-bit
2122 /// signed [8 x i16] vectors, saving each sum in the corresponding element of
2123 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2124 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2125 /// 0x8000.
2126 ///
2127 /// \headerfile <x86intrin.h>
2128 ///
2129 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2130 ///
2131 /// \param __a
2132 /// A 128-bit signed [8 x i16] vector.
2133 /// \param __b
2134 /// A 128-bit signed [8 x i16] vector.
2135 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2136 /// both parameters.
2137 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2138  __m128i __b) {
2139  return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2140 }
2141 
2142 /// Adds, with saturation, the corresponding elements of two 128-bit
2143 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2144 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2145 /// are saturated to 0xFF. Negative sums are saturated to 0x00.
2146 ///
2147 /// \headerfile <x86intrin.h>
2148 ///
2149 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2150 ///
2151 /// \param __a
2152 /// A 128-bit unsigned [16 x i8] vector.
2153 /// \param __b
2154 /// A 128-bit unsigned [16 x i8] vector.
2155 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2156 /// of both parameters.
2157 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2158  __m128i __b) {
2159  return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2160 }
2161 
2162 /// Adds, with saturation, the corresponding elements of two 128-bit
2163 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2164 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2165 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2166 ///
2167 /// \headerfile <x86intrin.h>
2168 ///
2169 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2170 ///
2171 /// \param __a
2172 /// A 128-bit unsigned [8 x i16] vector.
2173 /// \param __b
2174 /// A 128-bit unsigned [8 x i16] vector.
2175 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2176 /// of both parameters.
2177 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2178  __m128i __b) {
2179  return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2180 }
2181 
2182 /// Computes the rounded averages of corresponding elements of two
2183 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2184 /// corresponding element of a 128-bit result vector of [16 x i8].
2185 ///
2186 /// \headerfile <x86intrin.h>
2187 ///
2188 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2189 ///
2190 /// \param __a
2191 /// A 128-bit unsigned [16 x i8] vector.
2192 /// \param __b
2193 /// A 128-bit unsigned [16 x i8] vector.
2194 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2195 /// averages of both parameters.
2196 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2197  __m128i __b) {
2198  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); // Bit-preserving casts: __v16qi is only the vector type handed to the builtin.
2199 }
2200 
2201 /// Computes the rounded averages of corresponding elements of two
2202 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2203 /// corresponding element of a 128-bit result vector of [8 x i16].
2204 ///
2205 /// \headerfile <x86intrin.h>
2206 ///
2207 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2208 ///
2209 /// \param __a
2210 /// A 128-bit unsigned [8 x i16] vector.
2211 /// \param __b
2212 /// A 128-bit unsigned [8 x i16] vector.
2213 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2214 /// averages of both parameters.
2215 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2216  __m128i __b) {
2217  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); // Bit-preserving casts: __v8hi is only the vector type handed to the builtin.
2218 }
2219 
2220 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2221 /// vectors, producing eight intermediate 32-bit signed integer products, and
2222 /// adds each pair of adjacent 32-bit products to form a 128-bit signed
2223 /// [4 x i32] vector.
2224 ///
2225 /// For example, bits [15:0] of both parameters are multiplied producing a
2226 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2227 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2228 /// of the result.
2229 ///
2230 /// \headerfile <x86intrin.h>
2231 ///
2232 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2233 ///
2234 /// \param __a
2235 /// A 128-bit signed [8 x i16] vector.
2236 /// \param __b
2237 /// A 128-bit signed [8 x i16] vector.
2238 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2239 /// of both parameters.
2240 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2241  __m128i __b) {
2242  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2243 }
2244 
2245 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2246 /// vectors as signed values, saving the greater value from each comparison
2247 /// in the corresponding element of a 128-bit result vector of [8 x i16].
2248 ///
2249 /// \headerfile <x86intrin.h>
2250 ///
2251 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2252 ///
2253 /// \param __a
2254 /// A 128-bit signed [8 x i16] vector.
2255 /// \param __b
2256 /// A 128-bit signed [8 x i16] vector.
2257 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2258 /// each comparison.
2259 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2260  __m128i __b) {
2261  return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2262 }
2263 
2264 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2265 /// vectors as unsigned values, saving the greater value from each comparison
2266 /// in the corresponding element of a 128-bit result vector of [16 x i8].
2267 ///
2268 /// \headerfile <x86intrin.h>
2269 ///
2270 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2271 ///
2272 /// \param __a
2273 /// A 128-bit unsigned [16 x i8] vector.
2274 /// \param __b
2275 /// A 128-bit unsigned [16 x i8] vector.
2276 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2277 /// each comparison.
2278 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2279  __m128i __b) {
2280  return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2281 }
2282 
2283 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2284 /// vectors as signed values, saving the smaller value from each comparison
2285 /// in the corresponding element of a 128-bit result vector of [8 x i16].
2286 ///
2287 /// \headerfile <x86intrin.h>
2288 ///
2289 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2290 ///
2291 /// \param __a
2292 /// A 128-bit signed [8 x i16] vector.
2293 /// \param __b
2294 /// A 128-bit signed [8 x i16] vector.
2295 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2296 /// each comparison.
2297 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2298  __m128i __b) {
2299  return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2300 }
2301 
2302 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2303 /// vectors as unsigned values, saving the smaller value from each comparison
2304 /// in the corresponding element of a 128-bit result vector of [16 x i8].
2305 ///
2306 /// \headerfile <x86intrin.h>
2307 ///
2308 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2309 ///
2310 /// \param __a
2311 /// A 128-bit unsigned [16 x i8] vector.
2312 /// \param __b
2313 /// A 128-bit unsigned [16 x i8] vector.
2314 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2315 /// each comparison.
2316 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2317  __m128i __b) {
2318  return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2319 }
2320 
2321 /// Multiplies the corresponding elements of two signed [8 x i16]
2322 /// vectors, saving the upper 16 bits (bits [31:16]) of each 32-bit product
2323 /// in the corresponding element of a 128-bit signed [8 x i16] result vector.
2324 ///
2325 /// \headerfile <x86intrin.h>
2326 ///
2327 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2328 ///
2329 /// \param __a
2330 /// A 128-bit signed [8 x i16] vector.
2331 /// \param __b
2332 /// A 128-bit signed [8 x i16] vector.
2333 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2334 /// each of the eight 32-bit products.
2335 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2336  __m128i __b) {
2337  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2338 }
2339 
2340 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2341 /// vectors, saving the upper 16 bits (bits [31:16]) of each 32-bit product
2342 /// in the corresponding element of a 128-bit unsigned [8 x i16] result vector.
2343 ///
2344 /// \headerfile <x86intrin.h>
2345 ///
2346 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2347 ///
2348 /// \param __a
2349 /// A 128-bit unsigned [8 x i16] vector.
2350 /// \param __b
2351 /// A 128-bit unsigned [8 x i16] vector.
2352 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2353 /// of each of the eight 32-bit products.
2354 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2355  __m128i __b) {
2356  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); // Bit-preserving casts: __v8hi is only the vector type handed to the builtin.
2357 }
2358 
2359 /// Multiplies the corresponding elements of two signed [8 x i16]
2360 /// vectors, saving the lower 16 bits (bits [15:0]) of each 32-bit product
2361 /// in the corresponding element of a 128-bit signed [8 x i16] result vector.
2362 ///
2363 /// \headerfile <x86intrin.h>
2364 ///
2365 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2366 ///
2367 /// \param __a
2368 /// A 128-bit signed [8 x i16] vector.
2369 /// \param __b
2370 /// A 128-bit signed [8 x i16] vector.
2371 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2372 /// each of the eight 32-bit products.
2373 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2374  __m128i __b) {
2375  return (__m128i)((__v8hu)__a * (__v8hu)__b); // Unsigned lanes wrap on overflow; the low 16 bits do not depend on signedness.
2376 }
2377 
2378 /// Multiplies the 32-bit unsigned integer values contained in the lower
2379 /// 32 bits of the two 64-bit integer vectors and returns the 64-bit unsigned
2380 /// product.
2381 ///
2382 /// \headerfile <x86intrin.h>
2383 ///
2384 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2385 ///
2386 /// \param __a
2387 /// A 64-bit integer vector containing one of the source operands.
2388 /// \param __b
2389 /// A 64-bit integer vector containing one of the source operands.
2390 /// \returns A 64-bit integer vector containing the product of both operands.
2391 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2392  __m64 __b) {
2393  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2394 }
2395 
2396 /// Multiplies the 32-bit unsigned integer values contained in the lower
2397 /// 32 bits of the corresponding elements of two [2 x i64] vectors, and returns
2398 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2399 ///
2400 /// \headerfile <x86intrin.h>
2401 ///
2402 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2403 ///
2404 /// \param __a
2405 /// A [2 x i64] vector containing one of the source operands.
2406 /// \param __b
2407 /// A [2 x i64] vector containing one of the source operands.
2408 /// \returns A [2 x i64] vector containing the products of both operands.
2409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2410  __m128i __b) {
2411  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2412 }
2413 
2414 /// Computes the absolute differences of corresponding 8-bit integer
2415 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2416 /// separately sums the second 8 absolute differences. Packs these two
2417 /// unsigned 16-bit integer sums into the upper and lower elements of a
2418 /// [2 x i64] vector.
2419 ///
2420 /// \headerfile <x86intrin.h>
2421 ///
2422 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2423 ///
2424 /// \param __a
2425 /// A 128-bit integer vector containing one of the source operands.
2426 /// \param __b
2427 /// A 128-bit integer vector containing one of the source operands.
2428 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2429 /// differences between both operands.
2430 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2431  __m128i __b) {
2432  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); // Bit-preserving casts: __v16qi is only the vector type handed to the builtin.
2433 }
2434 
2435 /// Subtracts the corresponding 8-bit integer values in the operands.
2436 ///
2437 /// \headerfile <x86intrin.h>
2438 ///
2439 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2440 ///
2441 /// \param __a
2442 /// A 128-bit integer vector containing the minuends.
2443 /// \param __b
2444 /// A 128-bit integer vector containing the subtrahends.
2445 /// \returns A 128-bit integer vector containing the differences of the values
2446 /// in the operands.
2447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2448  __m128i __b) {
2449  return (__m128i)((__v16qu)__a - (__v16qu)__b); // Unsigned lanes: differences wrap around instead of overflowing.
2450 }
2451 
2452 /// Subtracts the corresponding 16-bit integer values in the operands.
2453 ///
2454 /// \headerfile <x86intrin.h>
2455 ///
2456 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2457 ///
2458 /// \param __a
2459 /// A 128-bit integer vector containing the minuends.
2460 /// \param __b
2461 /// A 128-bit integer vector containing the subtrahends.
2462 /// \returns A 128-bit integer vector containing the differences of the values
2463 /// in the operands.
2464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2465  __m128i __b) {
2466  return (__m128i)((__v8hu)__a - (__v8hu)__b); // Unsigned lanes: differences wrap around instead of overflowing.
2467 }
2468 
2469 /// Subtracts the corresponding 32-bit integer values in the operands.
2470 ///
2471 /// \headerfile <x86intrin.h>
2472 ///
2473 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2474 ///
2475 /// \param __a
2476 /// A 128-bit integer vector containing the minuends.
2477 /// \param __b
2478 /// A 128-bit integer vector containing the subtrahends.
2479 /// \returns A 128-bit integer vector containing the differences of the values
2480 /// in the operands.
2481 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2482  __m128i __b) {
2483  return (__m128i)((__v4su)__a - (__v4su)__b); // NOTE(review): __v4su is declared elsewhere; presumably unsigned [4 x i32], so lanes wrap - confirm.
2484 }
2485 
2486 /// Subtracts one signed or unsigned 64-bit integer value from another and
2487 /// writes the difference to the corresponding bits in the destination.
2488 ///
2489 /// \headerfile <x86intrin.h>
2490 ///
2491 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2492 ///
2493 /// \param __a
2494 /// A 64-bit integer vector containing the minuend.
2495 /// \param __b
2496 /// A 64-bit integer vector containing the subtrahend.
2497 /// \returns A 64-bit integer vector containing the difference of the values in
2498 /// the operands.
2499 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2500  __m64 __b) {
2501  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2502 }
2503 
2504 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2505 ///
2506 /// \headerfile <x86intrin.h>
2507 ///
2508 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2509 ///
2510 /// \param __a
2511 /// A 128-bit integer vector containing the minuends.
2512 /// \param __b
2513 /// A 128-bit integer vector containing the subtrahends.
2514 /// \returns A 128-bit integer vector containing the differences of the values
2515 /// in the operands.
2516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2517  __m128i __b) {
2518  return (__m128i)((__v2du)__a - (__v2du)__b); // Unsigned lanes: differences wrap around instead of overflowing.
2519 }
2520 
2521 /// Subtracts corresponding 8-bit signed integer values in the input and
2522 /// returns the differences in the corresponding bytes in the destination.
2523 /// Differences greater than 0x7F are saturated to 0x7F, and differences less
2524 /// than 0x80 (the most negative signed 8-bit value) are saturated to 0x80.
2525 ///
2526 /// \headerfile <x86intrin.h>
2527 ///
2528 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2529 ///
2530 /// \param __a
2531 /// A 128-bit integer vector containing the minuends.
2532 /// \param __b
2533 /// A 128-bit integer vector containing the subtrahends.
2534 /// \returns A 128-bit integer vector containing the differences of the values
2535 /// in the operands.
2536 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2537  __m128i __b) {
2538  return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2539 }
2540 
2541 /// Subtracts corresponding 16-bit signed integer values in the input and
2542 /// returns the differences in the corresponding 16-bit elements of the
2543 /// destination. Differences greater than 0x7FFF are saturated to 0x7FFF, and
2544 /// differences less than 0x8000 (the most negative signed 16-bit value) are saturated to 0x8000.
2545 ///
2546 /// \headerfile <x86intrin.h>
2547 ///
2548 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2549 ///
2550 /// \param __a
2551 /// A 128-bit integer vector containing the minuends.
2552 /// \param __b
2553 /// A 128-bit integer vector containing the subtrahends.
2554 /// \returns A 128-bit integer vector containing the differences of the values
2555 /// in the operands.
2556 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2557  __m128i __b) {
2558  return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2559 }
2560 
2561 /// Subtracts corresponding 8-bit unsigned integer values in the input
2562 /// and returns the differences in the corresponding bytes in the
2563 /// destination. Differences that would be negative are saturated to 0x00.
2564 ///
2565 /// \headerfile <x86intrin.h>
2566 ///
2567 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2568 ///
2569 /// \param __a
2570 /// A 128-bit integer vector containing the minuends.
2571 /// \param __b
2572 /// A 128-bit integer vector containing the subtrahends.
2573 /// \returns A 128-bit integer vector containing the unsigned integer
2574 /// differences of the values in the operands.
2575 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2576  __m128i __b) {
2577  return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2578 }
2579 
2580 /// Subtracts corresponding 16-bit unsigned integer values in the input
2581 /// and returns the differences in the corresponding 16-bit elements of the
2582 /// destination. Differences that would be negative are saturated to 0x0000.
2583 ///
2584 /// \headerfile <x86intrin.h>
2585 ///
2586 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2587 ///
2588 /// \param __a
2589 /// A 128-bit integer vector containing the minuends.
2590 /// \param __b
2591 /// A 128-bit integer vector containing the subtrahends.
2592 /// \returns A 128-bit integer vector containing the unsigned integer
2593 /// differences of the values in the operands.
2594 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2595  __m128i __b) {
2596  return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2597 }
2598 
2599 /// Performs a bitwise AND of two 128-bit integer vectors.
2600 ///
2601 /// \headerfile <x86intrin.h>
2602 ///
2603 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2604 ///
2605 /// \param __a
2606 /// A 128-bit integer vector containing one of the source operands.
2607 /// \param __b
2608 /// A 128-bit integer vector containing one of the source operands.
2609 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2610 /// in both operands (\a __a & \a __b).
2611 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2612  __m128i __b) {
2613  return (__m128i)((__v2du)__a & (__v2du)__b);
2614 }
2615 
2616 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2617 /// one's complement of the values contained in the first source operand.
2618 ///
2619 /// \headerfile <x86intrin.h>
2620 ///
2621 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2622 ///
2623 /// \param __a
2624 /// A 128-bit vector containing the left source operand. The one's complement
2625 /// of this value is used in the bitwise AND.
2626 /// \param __b
2627 /// A 128-bit vector containing the right source operand.
2628 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2629 /// complement of the first operand and the values in the second operand (~\a __a & \a __b).
2630 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2631  __m128i __b) {
2632  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2633 }
2634 /// Performs a bitwise OR of two 128-bit integer vectors.
2635 ///
2636 /// \headerfile <x86intrin.h>
2637 ///
2638 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2639 ///
2640 /// \param __a
2641 /// A 128-bit integer vector containing one of the source operands.
2642 /// \param __b
2643 /// A 128-bit integer vector containing one of the source operands.
2644 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2645 /// in both operands (\a __a | \a __b).
2646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2647  __m128i __b) {
2648  return (__m128i)((__v2du)__a | (__v2du)__b);
2649 }
2650 
2651 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2652 ///
2653 /// \headerfile <x86intrin.h>
2654 ///
2655 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2656 ///
2657 /// \param __a
2658 /// A 128-bit integer vector containing one of the source operands.
2659 /// \param __b
2660 /// A 128-bit integer vector containing one of the source operands.
2661 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2662 /// values in both operands (\a __a ^ \a __b).
2663 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2664  __m128i __b) {
2665  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2666 }
2667 
2668 /// Left-shifts the 128-bit integer vector operand by the specified
2669 /// number of bytes. Low-order bits are cleared. The _mm_bslli_si128 macro that follows has the identical expansion.
2670 ///
2671 /// \headerfile <x86intrin.h>
2672 ///
2673 /// \code
2674 /// __m128i _mm_slli_si128(__m128i a, const int imm);
2675 /// \endcode
2676 ///
2677 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2678 ///
2679 /// \param a
2680 /// A 128-bit integer vector containing the source operand.
2681 /// \param imm
2682 /// An immediate value specifying the number of bytes to left-shift operand
2683 /// \a a.
2684 /// \returns A 128-bit integer vector containing the left-shifted value.
2685 #define _mm_slli_si128(a, imm) \
2686  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2687  (int)(imm)))
2688 
2689 #define _mm_bslli_si128(a, imm) \
2690  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2691  (int)(imm)))
2692 
2693 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2694 /// by the number of bits given as a scalar count. Low-order bits are cleared.
2695 ///
2696 /// \headerfile <x86intrin.h>
2697 ///
2698 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2699 ///
2700 /// \param __a
2701 /// A 128-bit integer vector containing the source operand.
2702 /// \param __count
2703 /// An integer value specifying the number of bits to left-shift each value
2704 /// in operand \a __a.
2705 /// \returns A 128-bit integer vector containing the left-shifted values.
2706 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2707  int __count) {
2708  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2709 }
2710 
2711 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2712 /// by the number of bits given in a vector count. Low-order bits are cleared.
2713 ///
2714 /// \headerfile <x86intrin.h>
2715 ///
2716 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2717 ///
2718 /// \param __a
2719 /// A 128-bit integer vector containing the source operand.
2720 /// \param __count
2721 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2722 /// to left-shift each value in operand \a __a.
2723 /// \returns A 128-bit integer vector containing the left-shifted values.
2724 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2725  __m128i __count) {
2726  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2727 }
2728 
2729 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2730 /// by the number of bits given as a scalar count. Low-order bits are cleared.
2731 ///
2732 /// \headerfile <x86intrin.h>
2733 ///
2734 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2735 ///
2736 /// \param __a
2737 /// A 128-bit integer vector containing the source operand.
2738 /// \param __count
2739 /// An integer value specifying the number of bits to left-shift each value
2740 /// in operand \a __a.
2741 /// \returns A 128-bit integer vector containing the left-shifted values.
2742 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2743  int __count) {
2744  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2745 }
2746 
2747 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2748 /// by the number of bits given in a vector count. Low-order bits are cleared.
2749 ///
2750 /// \headerfile <x86intrin.h>
2751 ///
2752 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2753 ///
2754 /// \param __a
2755 /// A 128-bit integer vector containing the source operand.
2756 /// \param __count
2757 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2758 /// to left-shift each value in operand \a __a.
2759 /// \returns A 128-bit integer vector containing the left-shifted values.
2760 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2761  __m128i __count) {
2762  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2763 }
2764 
2765 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2766 /// by the number of bits given as a scalar count. Low-order bits are cleared.
2767 ///
2768 /// \headerfile <x86intrin.h>
2769 ///
2770 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2771 ///
2772 /// \param __a
2773 /// A 128-bit integer vector containing the source operand.
2774 /// \param __count
2775 /// An integer value specifying the number of bits to left-shift each value
2776 /// in operand \a __a.
2777 /// \returns A 128-bit integer vector containing the left-shifted values.
2778 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2779  int __count) {
2780  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2781 }
2782 
2783 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2784 /// by the number of bits given in a vector count. Low-order bits are cleared.
2785 ///
2786 /// \headerfile <x86intrin.h>
2787 ///
2788 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2789 ///
2790 /// \param __a
2791 /// A 128-bit integer vector containing the source operand.
2792 /// \param __count
2793 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2794 /// to left-shift each value in operand \a __a.
2795 /// \returns A 128-bit integer vector containing the left-shifted values.
2796 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2797  __m128i __count) {
2798  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2799 }
2800 
2801 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2802 /// by the number of bits given as a scalar count. High-order bits are filled
2803 /// with copies of the sign bit of the initial value.
2804 ///
2805 /// \headerfile <x86intrin.h>
2806 ///
2807 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2808 ///
2809 /// \param __a
2810 /// A 128-bit integer vector containing the source operand.
2811 /// \param __count
2812 /// An integer value specifying the number of bits to right-shift each value
2813 /// in operand \a __a.
2814 /// \returns A 128-bit integer vector containing the right-shifted values.
2815 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2816  int __count) {
2817  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2818 }
2819 
2820 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2821 /// by the number of bits given in a vector count. High-order bits are filled
2822 /// with copies of the sign bit of the initial value.
2823 ///
2824 /// \headerfile <x86intrin.h>
2825 ///
2826 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2827 ///
2828 /// \param __a
2829 /// A 128-bit integer vector containing the source operand.
2830 /// \param __count
2831 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2832 /// to right-shift each value in operand \a __a.
2833 /// \returns A 128-bit integer vector containing the right-shifted values.
2834 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2835  __m128i __count) {
2836  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2837 }
2838 
2839 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2840 /// by the number of bits given as a scalar count. High-order bits are filled
2841 /// with copies of the sign bit of the initial value.
2842 ///
2843 /// \headerfile <x86intrin.h>
2844 ///
2845 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2846 ///
2847 /// \param __a
2848 /// A 128-bit integer vector containing the source operand.
2849 /// \param __count
2850 /// An integer value specifying the number of bits to right-shift each value
2851 /// in operand \a __a.
2852 /// \returns A 128-bit integer vector containing the right-shifted values.
2853 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2854  int __count) {
2855  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2856 }
2857 
2858 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2859 /// by the number of bits given in a vector count. High-order bits are filled
2860 /// with copies of the sign bit of the initial value.
2861 ///
2862 /// \headerfile <x86intrin.h>
2863 ///
2864 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2865 ///
2866 /// \param __a
2867 /// A 128-bit integer vector containing the source operand.
2868 /// \param __count
2869 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2870 /// to right-shift each value in operand \a __a.
2871 /// \returns A 128-bit integer vector containing the right-shifted values.
2872 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2873  __m128i __count) {
2874  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2875 }
2876 
2877 /// Right-shifts the 128-bit integer vector operand by the specified
2878 /// number of bytes. High-order bits are cleared. The _mm_bsrli_si128 macro that follows has the identical expansion.
2879 ///
2880 /// \headerfile <x86intrin.h>
2881 ///
2882 /// \code
2883 /// __m128i _mm_srli_si128(__m128i a, const int imm);
2884 /// \endcode
2885 ///
2886 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2887 ///
2888 /// \param a
2889 /// A 128-bit integer vector containing the source operand.
2890 /// \param imm
2891 /// An immediate value specifying the number of bytes to right-shift operand
2892 /// \a a.
2893 /// \returns A 128-bit integer vector containing the right-shifted value.
2894 #define _mm_srli_si128(a, imm) \
2895  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2896  (int)(imm)))
2897 
2898 #define _mm_bsrli_si128(a, imm) \
2899  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2900  (int)(imm)))
2901 
2902 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
2903 /// operand by the number of bits given as a scalar count. High-order bits are cleared.
2904 ///
2905 /// \headerfile <x86intrin.h>
2906 ///
2907 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2908 ///
2909 /// \param __a
2910 /// A 128-bit integer vector containing the source operand.
2911 /// \param __count
2912 /// An integer value specifying the number of bits to right-shift each value
2913 /// in operand \a __a.
2914 /// \returns A 128-bit integer vector containing the right-shifted values.
2915 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2916  int __count) {
2917  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2918 }
2919 
2920 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
2921 /// operand by the number of bits given in a vector count. High-order bits are cleared.
2922 ///
2923 /// \headerfile <x86intrin.h>
2924 ///
2925 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2926 ///
2927 /// \param __a
2928 /// A 128-bit integer vector containing the source operand.
2929 /// \param __count
2930 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2931 /// to right-shift each value in operand \a __a.
2932 /// \returns A 128-bit integer vector containing the right-shifted values.
2933 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2934  __m128i __count) {
2935  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2936 }
2937 
2938 /// Right-shifts each of the 32-bit values in the 128-bit integer vector
2939 /// operand by the number of bits given as a scalar count. High-order bits are cleared.
2940 ///
2941 /// \headerfile <x86intrin.h>
2942 ///
2943 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2944 ///
2945 /// \param __a
2946 /// A 128-bit integer vector containing the source operand.
2947 /// \param __count
2948 /// An integer value specifying the number of bits to right-shift each value
2949 /// in operand \a __a.
2950 /// \returns A 128-bit integer vector containing the right-shifted values.
2951 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2952  int __count) {
2953  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2954 }
2955 
2956 /// Right-shifts each of the 32-bit values in the 128-bit integer vector
2957 /// operand by the number of bits given in a vector count. High-order bits are cleared.
2958 ///
2959 /// \headerfile <x86intrin.h>
2960 ///
2961 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2962 ///
2963 /// \param __a
2964 /// A 128-bit integer vector containing the source operand.
2965 /// \param __count
2966 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2967 /// to right-shift each value in operand \a __a.
2968 /// \returns A 128-bit integer vector containing the right-shifted values.
2969 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2970  __m128i __count) {
2971  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2972 }
2973 
2974 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2975 /// operand by the specified number of bits. High-order bits are cleared.
2976 ///
2977 /// \headerfile <x86intrin.h>
2978 ///
2979 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2980 ///
2981 /// \param __a
2982 /// A 128-bit integer vector containing the source operand.
2983 /// \param __count
2984 /// An integer value specifying the number of bits to right-shift each value
2985 /// in operand \a __a.
2986 /// \returns A 128-bit integer vector containing the right-shifted values.
2987 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2988  int __count) {
2989  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2990 }
2991 
2992 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2993 /// operand by the specified number of bits. High-order bits are cleared.
2994 ///
2995 /// \headerfile <x86intrin.h>
2996 ///
2997 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2998 ///
2999 /// \param __a
3000 /// A 128-bit integer vector containing the source operand.
3001 /// \param __count
3002 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3003 /// to right-shift each value in operand \a __a.
3004 /// \returns A 128-bit integer vector containing the right-shifted values.
3005 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3006  __m128i __count) {
3007  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3008 }
3009 
3010 /// Compares each of the corresponding 8-bit values of the 128-bit
3011 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3012 /// for true.
3013 ///
3014 /// \headerfile <x86intrin.h>
3015 ///
3016 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3017 ///
3018 /// \param __a
3019 /// A 128-bit integer vector.
3020 /// \param __b
3021 /// A 128-bit integer vector.
3022 /// \returns A 128-bit integer vector containing the comparison results.
3023 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3024  __m128i __b) {
3025  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3026 }
3027 
3028 /// Compares each of the corresponding 16-bit values of the 128-bit
3029 /// integer vectors for equality. Each comparison yields 0x0 for false,
3030 /// 0xFFFF for true.
3031 ///
3032 /// \headerfile <x86intrin.h>
3033 ///
3034 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3035 ///
3036 /// \param __a
3037 /// A 128-bit integer vector.
3038 /// \param __b
3039 /// A 128-bit integer vector.
3040 /// \returns A 128-bit integer vector containing the comparison results.
3041 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3042  __m128i __b) {
3043  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3044 }
3045 
3046 /// Compares each of the corresponding 32-bit values of the 128-bit
3047 /// integer vectors for equality. Each comparison yields 0x0 for false,
3048 /// 0xFFFFFFFF for true.
3049 ///
3050 /// \headerfile <x86intrin.h>
3051 ///
3052 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3053 ///
3054 /// \param __a
3055 /// A 128-bit integer vector.
3056 /// \param __b
3057 /// A 128-bit integer vector.
3058 /// \returns A 128-bit integer vector containing the comparison results.
3059 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3060  __m128i __b) {
3061  return (__m128i)((__v4si)__a == (__v4si)__b);
3062 }
3063 
3064 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3065 /// integer vectors to determine if the values in the first operand are
3066 /// greater than those in the second operand. Each comparison yields 0x0 for
3067 /// false, 0xFF for true.
3068 ///
3069 /// \headerfile <x86intrin.h>
3070 ///
3071 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3072 ///
3073 /// \param __a
3074 /// A 128-bit integer vector.
3075 /// \param __b
3076 /// A 128-bit integer vector.
3077 /// \returns A 128-bit integer vector containing the comparison results.
3078 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3079  __m128i __b) {
3080  /* This function always performs a signed comparison, but __v16qi is a char
3081  which may be signed or unsigned, so use __v16qs. */
3082  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3083 }
3084 
3085 /// Compares each of the corresponding signed 16-bit values of the
3086 /// 128-bit integer vectors to determine if the values in the first operand
3087 /// are greater than those in the second operand.
3088 ///
3089 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3090 ///
3091 /// \headerfile <x86intrin.h>
3092 ///
3093 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3094 ///
3095 /// \param __a
3096 /// A 128-bit integer vector.
3097 /// \param __b
3098 /// A 128-bit integer vector.
3099 /// \returns A 128-bit integer vector containing the comparison results.
3100 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3101  __m128i __b) {
3102  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3103 }
3104 
3105 /// Compares each of the corresponding signed 32-bit values of the
3106 /// 128-bit integer vectors to determine if the values in the first operand
3107 /// are greater than those in the second operand.
3108 ///
3109 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3110 ///
3111 /// \headerfile <x86intrin.h>
3112 ///
3113 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3114 ///
3115 /// \param __a
3116 /// A 128-bit integer vector.
3117 /// \param __b
3118 /// A 128-bit integer vector.
3119 /// \returns A 128-bit integer vector containing the comparison results.
3120 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3121  __m128i __b) {
3122  return (__m128i)((__v4si)__a > (__v4si)__b);
3123 }
3124 
3125 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3126 /// integer vectors to determine if the values in the first operand are less
3127 /// than those in the second operand.
3128 ///
3129 /// Each comparison yields 0x0 for false, 0xFF for true.
3130 ///
3131 /// \headerfile <x86intrin.h>
3132 ///
3133 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3134 ///
3135 /// \param __a
3136 /// A 128-bit integer vector.
3137 /// \param __b
3138 /// A 128-bit integer vector.
3139 /// \returns A 128-bit integer vector containing the comparison results.
3140 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3141  __m128i __b) {
3142  return _mm_cmpgt_epi8(__b, __a);
3143 }
3144 
3145 /// Compares each of the corresponding signed 16-bit values of the
3146 /// 128-bit integer vectors to determine if the values in the first operand
3147 /// are less than those in the second operand.
3148 ///
3149 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3150 ///
3151 /// \headerfile <x86intrin.h>
3152 ///
3153 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3154 ///
3155 /// \param __a
3156 /// A 128-bit integer vector.
3157 /// \param __b
3158 /// A 128-bit integer vector.
3159 /// \returns A 128-bit integer vector containing the comparison results.
3160 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3161  __m128i __b) {
3162  return _mm_cmpgt_epi16(__b, __a);
3163 }
3164 
3165 /// Compares each of the corresponding signed 32-bit values of the
3166 /// 128-bit integer vectors to determine if the values in the first operand
3167 /// are less than those in the second operand.
3168 ///
3169 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3170 ///
3171 /// \headerfile <x86intrin.h>
3172 ///
3173 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3174 ///
3175 /// \param __a
3176 /// A 128-bit integer vector.
3177 /// \param __b
3178 /// A 128-bit integer vector.
3179 /// \returns A 128-bit integer vector containing the comparison results.
3180 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3181  __m128i __b) {
3182  return _mm_cmpgt_epi32(__b, __a);
3183 }
3184 
3185 #ifdef __x86_64__
3186 /// Converts a 64-bit signed integer value from the second operand into a
3187 /// double-precision value and returns it in the lower element of a [2 x
3188 /// double] vector; the upper element of the returned vector is copied from
3189 /// the upper element of the first operand.
3190 ///
3191 /// \headerfile <x86intrin.h>
3192 ///
3193 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3194 ///
3195 /// \param __a
3196 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3197 /// copied to the upper 64 bits of the destination.
3198 /// \param __b
3199 /// A 64-bit signed integer operand containing the value to be converted.
3200 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3201 /// converted value of the second operand. The upper 64 bits are copied from
3202 /// the upper 64 bits of the first operand.
3203 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3204  long long __b) {
3205  __a[0] = __b;
3206  return __a;
3207 }
3208 
3209 /// Converts the first (lower) element of a vector of [2 x double] into a
3210 /// 64-bit signed integer value, according to the current rounding mode.
3211 ///
3212 /// \headerfile <x86intrin.h>
3213 ///
3214 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3215 ///
3216 /// \param __a
3217 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3218 /// conversion.
3219 /// \returns A 64-bit signed integer containing the converted value.
3220 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3221  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3222 }
3223 
3224 /// Converts the first (lower) element of a vector of [2 x double] into a
3225 /// 64-bit signed integer value, truncating the result when it is inexact.
3226 ///
3227 /// \headerfile <x86intrin.h>
3228 ///
3229 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3230 /// instruction.
3231 ///
3232 /// \param __a
3233 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3234 /// conversion.
3235 /// \returns A 64-bit signed integer containing the converted value.
3236 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3237  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3238 }
3239 #endif
3240 
3241 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3242 ///
3243 /// \headerfile <x86intrin.h>
3244 ///
3245 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3246 ///
3247 /// \param __a
3248 /// A 128-bit integer vector.
3249 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3250 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3251  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3252 }
3253 
3254 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3255 ///
3256 /// \headerfile <x86intrin.h>
3257 ///
3258 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3259 ///
3260 /// \param __a
3261 /// A 128-bit vector of [4 x float].
3262 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3263 /// values.
3264 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3265  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3266 }
3267 
3268 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3269 /// truncating the result when it is inexact.
3270 ///
3271 /// \headerfile <x86intrin.h>
3272 ///
3273 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3274 /// instruction.
3275 ///
3276 /// \param __a
3277 /// A 128-bit vector of [4 x float].
3278 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3279 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3280  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3281 }
3282 
3283 /// Returns a vector of [4 x i32] where the lowest element is the input
3284 /// operand and the remaining elements are zero.
3285 ///
3286 /// \headerfile <x86intrin.h>
3287 ///
3288 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3289 ///
3290 /// \param __a
3291 /// A 32-bit signed integer operand.
3292 /// \returns A 128-bit vector of [4 x i32].
3293 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3294  return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3295 }
3296 
3297 /// Returns a vector of [2 x i64] where the lower element is the input
3298 /// operand and the upper element is zero.
3299 ///
3300 /// \headerfile <x86intrin.h>
3301 ///
3302 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3303 /// in 64-bit mode.
3304 ///
3305 /// \param __a
3306 /// A 64-bit signed integer operand containing the value to be converted.
3307 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3308 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3309  return __extension__(__m128i)(__v2di){__a, 0};
3310 }
3311 
3312 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3313 /// 32-bit signed integer value.
3314 ///
3315 /// \headerfile <x86intrin.h>
3316 ///
3317 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3318 ///
3319 /// \param __a
3320 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3321 /// destination.
3322 /// \returns A 32-bit signed integer containing the moved value.
3323 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3324  __v4si __b = (__v4si)__a;
3325  return __b[0];
3326 }
3327 
3328 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3329 /// 64-bit signed integer value.
3330 ///
3331 /// \headerfile <x86intrin.h>
3332 ///
3333 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3334 ///
3335 /// \param __a
3336 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3337 /// destination.
3338 /// \returns A 64-bit signed integer containing the moved value.
3339 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3340  return __a[0];
3341 }
3342 
3343 /// Moves packed integer values from an aligned 128-bit memory location
3344 /// to elements in a 128-bit integer vector.
3345 ///
3346 /// \headerfile <x86intrin.h>
3347 ///
3348 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3349 ///
3350 /// \param __p
3351 /// An aligned pointer to a memory location containing integer values.
3352 /// \returns A 128-bit integer vector containing the moved values.
3353 static __inline__ __m128i __DEFAULT_FN_ATTRS
3354 _mm_load_si128(__m128i const *__p) {
3355  return *__p;
3356 }
3357 
3358 /// Moves packed integer values from an unaligned 128-bit memory location
3359 /// to elements in a 128-bit integer vector.
3360 ///
3361 /// \headerfile <x86intrin.h>
3362 ///
3363 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3364 ///
3365 /// \param __p
3366 /// A pointer to a memory location containing integer values.
3367 /// \returns A 128-bit integer vector containing the moved values.
3368 static __inline__ __m128i __DEFAULT_FN_ATTRS
3369 _mm_loadu_si128(__m128i_u const *__p) {
3370  struct __loadu_si128 {
3371  __m128i_u __v;
3372  } __attribute__((__packed__, __may_alias__));
3373  return ((const struct __loadu_si128 *)__p)->__v;
3374 }
3375 
3376 /// Returns a vector of [2 x i64] where the lower element is taken from
3377 /// the lower element of the operand, and the upper element is zero.
3378 ///
3379 /// \headerfile <x86intrin.h>
3380 ///
3381 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3382 ///
3383 /// \param __p
3384 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3385 /// the destination.
3386 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3387 /// moved value. The higher order bits are cleared.
3388 static __inline__ __m128i __DEFAULT_FN_ATTRS
3389 _mm_loadl_epi64(__m128i_u const *__p) {
3390  struct __mm_loadl_epi64_struct {
3391  long long __u;
3392  } __attribute__((__packed__, __may_alias__));
3393  return __extension__(__m128i){
3394  ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3395 }
3396 
3397 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3398 /// This could be used as an argument to another intrinsic function where the
3399 /// argument is required but the value is not actually used.
3400 ///
3401 /// \headerfile <x86intrin.h>
3402 ///
3403 /// This intrinsic has no corresponding instruction.
3404 ///
3405 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3407  return (__m128i)__builtin_ia32_undef128();
3408 }
3409 
3410 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3411 /// the specified 64-bit integer values.
3412 ///
3413 /// \headerfile <x86intrin.h>
3414 ///
3415 /// This intrinsic is a utility function and does not correspond to a specific
3416 /// instruction.
3417 ///
3418 /// \param __q1
3419 /// A 64-bit integer value used to initialize the upper 64 bits of the
3420 /// destination vector of [2 x i64].
3421 /// \param __q0
3422 /// A 64-bit integer value used to initialize the lower 64 bits of the
3423 /// destination vector of [2 x i64].
3424 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3425 /// provided in the operands.
3426 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3427  long long __q0) {
3428  return __extension__(__m128i)(__v2di){__q0, __q1};
3429 }
3430 
3431 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3432 /// the specified 64-bit integer values.
3433 ///
3434 /// \headerfile <x86intrin.h>
3435 ///
3436 /// This intrinsic is a utility function and does not correspond to a specific
3437 /// instruction.
3438 ///
3439 /// \param __q1
3440 /// A 64-bit integer value used to initialize the upper 64 bits of the
3441 /// destination vector of [2 x i64].
3442 /// \param __q0
3443 /// A 64-bit integer value used to initialize the lower 64 bits of the
3444 /// destination vector of [2 x i64].
3445 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3446 /// provided in the operands.
3447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3448  __m64 __q0) {
3449  return _mm_set_epi64x((long long)__q1, (long long)__q0);
3450 }
3451 
3452 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3453 /// the specified 32-bit integer values.
3454 ///
3455 /// \headerfile <x86intrin.h>
3456 ///
3457 /// This intrinsic is a utility function and does not correspond to a specific
3458 /// instruction.
3459 ///
3460 /// \param __i3
3461 /// A 32-bit integer value used to initialize bits [127:96] of the
3462 /// destination vector.
3463 /// \param __i2
3464 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3465 /// vector.
3466 /// \param __i1
3467 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3468 /// vector.
3469 /// \param __i0
3470 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3471 /// vector.
3472 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3473 /// provided in the operands.
3474 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3475  int __i1, int __i0) {
3476  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3477 }
3478 
3479 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3480 /// the specified 16-bit integer values.
3481 ///
3482 /// \headerfile <x86intrin.h>
3483 ///
3484 /// This intrinsic is a utility function and does not correspond to a specific
3485 /// instruction.
3486 ///
3487 /// \param __w7
3488 /// A 16-bit integer value used to initialize bits [127:112] of the
3489 /// destination vector.
3490 /// \param __w6
3491 /// A 16-bit integer value used to initialize bits [111:96] of the
3492 /// destination vector.
3493 /// \param __w5
3494 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3495 /// vector.
3496 /// \param __w4
3497 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3498 /// vector.
3499 /// \param __w3
3500 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3501 /// vector.
3502 /// \param __w2
3503 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3504 /// vector.
3505 /// \param __w1
3506 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3507 /// vector.
3508 /// \param __w0
3509 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3510 /// vector.
3511 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3512 /// provided in the operands.
3513 static __inline__ __m128i __DEFAULT_FN_ATTRS
3514 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3515  short __w2, short __w1, short __w0) {
3516  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3517  __w4, __w5, __w6, __w7};
3518 }
3519 
3520 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3521 /// the specified 8-bit integer values.
3522 ///
3523 /// \headerfile <x86intrin.h>
3524 ///
3525 /// This intrinsic is a utility function and does not correspond to a specific
3526 /// instruction.
3527 ///
3528 /// \param __b15
3529 /// Initializes bits [127:120] of the destination vector.
3530 /// \param __b14
3531 /// Initializes bits [119:112] of the destination vector.
3532 /// \param __b13
3533 /// Initializes bits [111:104] of the destination vector.
3534 /// \param __b12
3535 /// Initializes bits [103:96] of the destination vector.
3536 /// \param __b11
3537 /// Initializes bits [95:88] of the destination vector.
3538 /// \param __b10
3539 /// Initializes bits [87:80] of the destination vector.
3540 /// \param __b9
3541 /// Initializes bits [79:72] of the destination vector.
3542 /// \param __b8
3543 /// Initializes bits [71:64] of the destination vector.
3544 /// \param __b7
3545 /// Initializes bits [63:56] of the destination vector.
3546 /// \param __b6
3547 /// Initializes bits [55:48] of the destination vector.
3548 /// \param __b5
3549 /// Initializes bits [47:40] of the destination vector.
3550 /// \param __b4
3551 /// Initializes bits [39:32] of the destination vector.
3552 /// \param __b3
3553 /// Initializes bits [31:24] of the destination vector.
3554 /// \param __b2
3555 /// Initializes bits [23:16] of the destination vector.
3556 /// \param __b1
3557 /// Initializes bits [15:8] of the destination vector.
3558 /// \param __b0
3559 /// Initializes bits [7:0] of the destination vector.
3560 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3561 /// provided in the operands.
3562 static __inline__ __m128i __DEFAULT_FN_ATTRS
3563 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3564  char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3565  char __b4, char __b3, char __b2, char __b1, char __b0) {
3566  return __extension__(__m128i)(__v16qi){
3567  __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3568  __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3569 }
3570 
3571 /// Initializes both values in a 128-bit integer vector with the
3572 /// specified 64-bit integer value.
3573 ///
3574 /// \headerfile <x86intrin.h>
3575 ///
3576 /// This intrinsic is a utility function and does not correspond to a specific
3577 /// instruction.
3578 ///
3579 /// \param __q
3580 /// Integer value used to initialize the elements of the destination integer
3581 /// vector.
3582 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3583 /// elements containing the value provided in the operand.
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3585  return _mm_set_epi64x(__q, __q);
3586 }
3587 
3588 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3589 /// specified 64-bit value.
3590 ///
3591 /// \headerfile <x86intrin.h>
3592 ///
3593 /// This intrinsic is a utility function and does not correspond to a specific
3594 /// instruction.
3595 ///
3596 /// \param __q
3597 /// A 64-bit value used to initialize the elements of the destination integer
3598 /// vector.
3599 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3600 /// containing the value provided in the operand.
3601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3602  return _mm_set_epi64(__q, __q);
3603 }
3604 
3605 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3606 /// specified 32-bit value.
3607 ///
3608 /// \headerfile <x86intrin.h>
3609 ///
3610 /// This intrinsic is a utility function and does not correspond to a specific
3611 /// instruction.
3612 ///
3613 /// \param __i
3614 /// A 32-bit value used to initialize the elements of the destination integer
3615 /// vector.
3616 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3617 /// containing the value provided in the operand.
3618 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3619  return _mm_set_epi32(__i, __i, __i, __i);
3620 }
3621 
3622 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3623 /// specified 16-bit value.
3624 ///
3625 /// \headerfile <x86intrin.h>
3626 ///
3627 /// This intrinsic is a utility function and does not correspond to a specific
3628 /// instruction.
3629 ///
3630 /// \param __w
3631 /// A 16-bit value used to initialize the elements of the destination integer
3632 /// vector.
3633 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3634 /// containing the value provided in the operand.
3635 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3636  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3637 }
3638 
3639 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3640 /// specified 8-bit value.
3641 ///
3642 /// \headerfile <x86intrin.h>
3643 ///
3644 /// This intrinsic is a utility function and does not correspond to a specific
3645 /// instruction.
3646 ///
3647 /// \param __b
3648 /// An 8-bit value used to initialize the elements of the destination integer
3649 /// vector.
3650 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3651 /// containing the value provided in the operand.
3652 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3653  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3654  __b, __b, __b, __b, __b);
3655 }
3656 
3657 /// Constructs a 128-bit integer vector, initialized in reverse order
3658 /// with the specified 64-bit integral values.
3659 ///
3660 /// \headerfile <x86intrin.h>
3661 ///
3662 /// This intrinsic does not correspond to a specific instruction.
3663 ///
3664 /// \param __q0
3665 /// A 64-bit integral value used to initialize the lower 64 bits of the
3666 /// result.
3667 /// \param __q1
3668 /// A 64-bit integral value used to initialize the upper 64 bits of the
3669 /// result.
3670 /// \returns An initialized 128-bit integer vector.
3671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3672  __m64 __q1) {
3673  return _mm_set_epi64(__q1, __q0);
3674 }
3675 
3676 /// Constructs a 128-bit integer vector, initialized in reverse order
3677 /// with the specified 32-bit integral values.
3678 ///
3679 /// \headerfile <x86intrin.h>
3680 ///
3681 /// This intrinsic is a utility function and does not correspond to a specific
3682 /// instruction.
3683 ///
3684 /// \param __i0
3685 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3686 /// \param __i1
3687 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3688 /// \param __i2
3689 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3690 /// \param __i3
3691 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3692 /// \returns An initialized 128-bit integer vector.
3693 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3694  int __i2,
3695  int __i3) {
3696  return _mm_set_epi32(__i3, __i2, __i1, __i0);
3697 }
3698 
3699 /// Constructs a 128-bit integer vector, initialized in reverse order
3700 /// with the specified 16-bit integral values.
3701 ///
3702 /// \headerfile <x86intrin.h>
3703 ///
3704 /// This intrinsic is a utility function and does not correspond to a specific
3705 /// instruction.
3706 ///
3707 /// \param __w0
3708 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3709 /// \param __w1
3710 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3711 /// \param __w2
3712 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3713 /// \param __w3
3714 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3715 /// \param __w4
3716 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3717 /// \param __w5
3718 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3719 /// \param __w6
3720 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3721 /// \param __w7
3722 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3723 /// \returns An initialized 128-bit integer vector.
3724 static __inline__ __m128i __DEFAULT_FN_ATTRS
3725 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3726  short __w5, short __w6, short __w7) {
3727  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3728 }
3729 
3730 /// Constructs a 128-bit integer vector, initialized in reverse order
3731 /// with the specified 8-bit integral values.
3732 ///
3733 /// \headerfile <x86intrin.h>
3734 ///
3735 /// This intrinsic is a utility function and does not correspond to a specific
3736 /// instruction.
3737 ///
3738 /// \param __b0
3739 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3740 /// \param __b1
3741 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3742 /// \param __b2
3743 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3744 /// \param __b3
3745 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3746 /// \param __b4
3747 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3748 /// \param __b5
3749 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3750 /// \param __b6
3751 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3752 /// \param __b7
3753 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3754 /// \param __b8
3755 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3756 /// \param __b9
3757 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3758 /// \param __b10
3759 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3760 /// \param __b11
3761 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3762 /// \param __b12
3763 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3764 /// \param __b13
3765 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3766 /// \param __b14
3767 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3768 /// \param __b15
3769 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3770 /// \returns An initialized 128-bit integer vector.
3771 static __inline__ __m128i __DEFAULT_FN_ATTRS
3772 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3773  char __b6, char __b7, char __b8, char __b9, char __b10,
3774  char __b11, char __b12, char __b13, char __b14, char __b15) {
3775  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3776  __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3777 }
3778 
3779 /// Creates a 128-bit integer vector initialized to zero.
3780 ///
3781 /// \headerfile <x86intrin.h>
3782 ///
3783 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3784 ///
3785 /// \returns An initialized 128-bit integer vector with all elements set to
3786 /// zero.
3787 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3788  return __extension__(__m128i)(__v2di){0LL, 0LL};
3789 }
3790 
3791 /// Stores a 128-bit integer vector to a memory location aligned on a
3792 /// 128-bit boundary.
3793 ///
3794 /// \headerfile <x86intrin.h>
3795 ///
3796 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3797 ///
3798 /// \param __p
3799 /// A pointer to an aligned memory location that will receive the integer
3800 /// values.
3801 /// \param __b
3802 /// A 128-bit integer vector containing the values to be moved.
3803 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3804  __m128i __b) {
3805  *__p = __b;
3806 }
3807 
3808 /// Stores a 128-bit integer vector to an unaligned memory location.
3809 ///
3810 /// \headerfile <x86intrin.h>
3811 ///
3812 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3813 ///
3814 /// \param __p
3815 /// A pointer to a memory location that will receive the integer values.
3816 /// \param __b
3817 /// A 128-bit integer vector containing the values to be moved.
3818 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3819  __m128i __b) {
3820  struct __storeu_si128 {
3821  __m128i_u __v;
3822  } __attribute__((__packed__, __may_alias__));
3823  ((struct __storeu_si128 *)__p)->__v = __b;
3824 }
3825 
3826 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3827 /// vector.
3828 ///
3829 /// \headerfile <x86intrin.h>
3830 ///
3831 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3832 ///
3833 /// \param __p
3834 /// A pointer to a 64-bit memory location. The address of the memory
3835 /// location does not have to be aligned.
3836 /// \param __b
3837 /// A 128-bit integer vector containing the value to be stored.
3838 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3839  __m128i __b) {
3840  struct __storeu_si64 {
3841  long long __v;
3842  } __attribute__((__packed__, __may_alias__));
3843  ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3844 }
3845 
3846 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3847 /// vector.
3848 ///
3849 /// \headerfile <x86intrin.h>
3850 ///
3851 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3852 ///
3853 /// \param __p
3854 /// A pointer to a 32-bit memory location. The address of the memory
3855 /// location does not have to be aligned.
3856 /// \param __b
3857 /// A 128-bit integer vector containing the value to be stored.
3858 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3859  __m128i __b) {
3860  struct __storeu_si32 {
3861  int __v;
3862  } __attribute__((__packed__, __may_alias__));
3863  ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3864 }
3865 
3866 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3867 /// vector.
3868 ///
3869 /// \headerfile <x86intrin.h>
3870 ///
3871 /// This intrinsic does not correspond to a specific instruction.
3872 ///
3873 /// \param __p
3874 /// A pointer to a 16-bit memory location. The address of the memory
3875 /// location does not have to be aligned.
3876 /// \param __b
3877 /// A 128-bit integer vector containing the value to be stored.
3878 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3879  __m128i __b) {
3880  struct __storeu_si16 {
3881  short __v;
3882  } __attribute__((__packed__, __may_alias__));
3883  ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3884 }
3885 
3886 /// Moves bytes selected by the mask from the first operand to the
3887 /// specified unaligned memory location. When a mask bit is 1, the
3888 /// corresponding byte is written, otherwise it is not written.
3889 ///
3890 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3891 /// used again soon). Exception and trap behavior for elements not selected
3892 /// for storage to memory are implementation dependent.
3893 ///
3894 /// \headerfile <x86intrin.h>
3895 ///
3896 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3897 /// instruction.
3898 ///
3899 /// \param __d
3900 /// A 128-bit integer vector containing the values to be moved.
3901 /// \param __n
3902 /// A 128-bit integer vector containing the mask. The most significant bit of
3903 /// each byte is the mask bit for that byte.
3904 /// \param __p
3905 /// A pointer to an unaligned 128-bit memory location where the specified
3906 /// values are moved.
3907 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3908  __m128i __n,
3909  char *__p) {
3910  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3911 }
3912 
3913 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3914 /// a memory location.
3915 ///
3916 /// \headerfile <x86intrin.h>
3917 ///
3918 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3919 ///
3920 /// \param __p
3921 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3922 /// of the integer vector parameter.
3923 /// \param __a
3924 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3925 /// value to be stored.
3926 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3927  __m128i __a) {
3928  struct __mm_storel_epi64_struct {
3929  long long __u;
3930  } __attribute__((__packed__, __may_alias__));
3931  ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3932 }
3933 
3934 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3935 /// aligned memory location.
3936 ///
3937 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3938 /// used again soon).
3939 ///
3940 /// \headerfile <x86intrin.h>
3941 ///
3942 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3943 ///
3944 /// \param __p
3945 /// A pointer to the 128-bit aligned memory location used to store the value.
3946 /// \param __a
3947 /// A vector of [2 x double] containing the 64-bit values to be stored.
3948 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
3949  __m128d __a) {
3950  __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3951 }
3952 
3953 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3954 ///
3955 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3956 /// used again soon).
3957 ///
3958 /// \headerfile <x86intrin.h>
3959 ///
3960 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3961 ///
3962 /// \param __p
3963 /// A pointer to the 128-bit aligned memory location used to store the value.
3964 /// \param __a
3965 /// A 128-bit integer vector containing the values to be stored.
3966 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
3967  __m128i __a) {
3968  __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3969 }
3970 
3971 /// Stores a 32-bit integer value in the specified memory location.
3972 ///
3973 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3974 /// used again soon).
3975 ///
3976 /// \headerfile <x86intrin.h>
3977 ///
3978 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3979 ///
3980 /// \param __p
3981 /// A pointer to the 32-bit memory location used to store the value.
3982 /// \param __a
3983 /// A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(int *__p, int __a) {
  /* Hand the destination and the 32-bit value to the MOVNTI builtin. */
  int *__dst = __p;
  int __val = __a;
  __builtin_ia32_movnti(__dst, __val);
}
3989 
3990 #ifdef __x86_64__
3991 /// Stores a 64-bit integer value in the specified memory location.
3992 ///
3993 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3994 /// used again soon).
3995 ///
3996 /// \headerfile <x86intrin.h>
3997 ///
3998 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
3999 ///
4000 /// \param __p
4001 /// A pointer to the 64-bit memory location used to store the value.
4002 /// \param __a
4003 /// A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(long long *__p, long long __a) {
  /* Hand the destination and the 64-bit value to the 64-bit MOVNTI builtin. */
  long long *__dst = __p;
  long long __val = __a;
  __builtin_ia32_movnti64(__dst, __val);
}
4009 #endif
4010 
#if defined(__cplusplus)
extern "C" {
#endif

/// Flushes the cache line that contains \a __p and invalidates it in every
///    cache of the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Enforces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that all earlier loads complete before any later
///    load executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Enforces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and
///    store instructions that follow it, so that all earlier memory accesses
///    complete before any later memory access executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4052 
4053 /// Converts 16-bit signed integers from both 128-bit integer vector
4054 /// operands into 8-bit signed integers, and packs the results into the
4055 /// destination. Positive values greater than 0x7F are saturated to 0x7F.
4056 /// Negative values less than 0x80 are saturated to 0x80.
4057 ///
4058 /// \headerfile <x86intrin.h>
4059 ///
4060 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4061 ///
4062 /// \param __a
4063 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4064 /// a signed integer and is converted to a 8-bit signed integer with
4065 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4066 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4067 /// written to the lower 64 bits of the result.
4068 /// \param __b
4069 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4070 /// a signed integer and is converted to a 8-bit signed integer with
4071 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4072 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4073 /// written to the higher 64 bits of the result.
4074 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4075 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4076  __m128i __b) {
4077  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4078 }
4079 
4080 /// Converts 32-bit signed integers from both 128-bit integer vector
4081 /// operands into 16-bit signed integers, and packs the results into the
4082 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4083 /// Negative values less than 0x8000 are saturated to 0x8000.
4084 ///
4085 /// \headerfile <x86intrin.h>
4086 ///
4087 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4088 ///
4089 /// \param __a
4090 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4091 /// a signed integer and is converted to a 16-bit signed integer with
4092 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4093 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4094 /// are written to the lower 64 bits of the result.
4095 /// \param __b
4096 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4097 /// a signed integer and is converted to a 16-bit signed integer with
4098 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4099 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4100 /// are written to the higher 64 bits of the result.
4101 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4102 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4103  __m128i __b) {
4104  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4105 }
4106 
4107 /// Converts 16-bit signed integers from both 128-bit integer vector
4108 /// operands into 8-bit unsigned integers, and packs the results into the
4109 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4110 /// than 0x00 are saturated to 0x00.
4111 ///
4112 /// \headerfile <x86intrin.h>
4113 ///
4114 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4115 ///
4116 /// \param __a
4117 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4118 /// a signed integer and is converted to an 8-bit unsigned integer with
4119 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4120 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4121 /// written to the lower 64 bits of the result.
4122 /// \param __b
4123 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4124 /// a signed integer and is converted to an 8-bit unsigned integer with
4125 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4126 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4127 /// written to the higher 64 bits of the result.
4128 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4129 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4130  __m128i __b) {
4131  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4132 }
4133 
4134 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4135 /// the immediate-value parameter as a selector.
4136 ///
4137 /// \headerfile <x86intrin.h>
4138 ///
4139 /// \code
4140 /// int _mm_extract_epi16(__m128i a, const int imm);
4141 /// \endcode
4142 ///
4143 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4144 ///
4145 /// \param a
4146 /// A 128-bit integer vector.
4147 /// \param imm
4148 /// An immediate value. Bits [2:0] select the value from \a a to be assigned
4149 /// to bits [15:0] of the result. \n
4150 /// 000: assign values from bits [15:0] of \a a. \n
4151 /// 001: assign values from bits [31:16] of \a a. \n
4152 /// 010: assign values from bits [47:32] of \a a. \n
4153 /// 011: assign values from bits [63:48] of \a a. \n
4154 /// 100: assign values from bits [79:64] of \a a. \n
4155 /// 101: assign values from bits [95:80] of \a a. \n
4156 /// 110: assign values from bits [111:96] of \a a. \n
4157 /// 111: assign values from bits [127:112] of \a a.
4158 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4159 /// integer vector parameter and the remaining bits are assigned zeros.
/* Kept as an expression macro so that imm reaches the builtin exactly as
 * written. The (unsigned short) cast zero-extends the selected 16-bit element
 * before it is widened to int. */
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi(                           \
      (__v8hi)(__m128i)(a),                                                    \
      (int)(imm)))
4163 
4164 /// Constructs a 128-bit integer vector by first making a copy of the
4165 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4166 /// of an integer parameter into an offset specified by the immediate-value
4167 /// parameter.
4168 ///
4169 /// \headerfile <x86intrin.h>
4170 ///
4171 /// \code
4172 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4173 /// \endcode
4174 ///
4175 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4176 ///
4177 /// \param a
4178 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4179 /// result and then one of the eight elements in the result is replaced by
4180 /// the lower 16 bits of \a b.
4181 /// \param b
4182 /// An integer. The lower 16 bits of this parameter are written to the
4183 /// result beginning at an offset specified by \a imm.
4184 /// \param imm
4185 /// An immediate value specifying the index (0 to 7) of the 16-bit element in
4186 /// the result that is replaced by the lower 16 bits of \a b.
4187 /// \returns A 128-bit integer vector containing the constructed values.
/* Kept as an expression macro so that imm reaches the builtin exactly as
 * written. a is viewed as [8 x i16]; b is passed as an int. */
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi(                                       \
      (__v8hi)(__m128i)(a),                                                    \
      (int)(b),                                                                \
      (int)(imm)))
4191 
4192 /// Copies the values of the most significant bits from each 8-bit
4193 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4194 /// value, zero-extends the value, and writes it to the destination.
4195 ///
4196 /// \headerfile <x86intrin.h>
4197 ///
4198 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4199 ///
4200 /// \param __a
4201 /// A 128-bit integer vector containing the values with bits to be extracted.
4202 /// \returns The most significant bits from each 8-bit element in \a __a,
4203 /// written to bits [15:0]. The other bits are assigned zeros.
4204 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4205  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4206 }
4207 
4208 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4209 /// elements of a 128-bit integer vector parameter, using the immediate-value
4210 /// parameter as a specifier.
4211 ///
4212 /// \headerfile <x86intrin.h>
4213 ///
4214 /// \code
4215 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4216 /// \endcode
4217 ///
4218 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4219 ///
4220 /// \param a
4221 /// A 128-bit integer vector containing the values to be copied.
4222 /// \param imm
4223 /// An immediate value containing an 8-bit value specifying which elements to
4224 /// copy from \a a. The destinations within the 128-bit destination are assigned
4225 /// values as follows: \n
4226 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4227 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4228 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4229 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4230 /// Bit value assignments: \n
4231 /// 00: assign values from bits [31:0] of \a a. \n
4232 /// 01: assign values from bits [63:32] of \a a. \n
4233 /// 10: assign values from bits [95:64] of \a a. \n
4234 /// 11: assign values from bits [127:96] of \a a. \n
4235 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4236 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4237 /// <c>[b6, b4, b2, b0]</c>.
4238 /// \returns A 128-bit integer vector containing the shuffled values.
/* Kept as an expression macro so that imm reaches the builtin exactly as
 * written. a is viewed as [4 x i32]. */
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd(                                             \
      (__v4si)(__m128i)(a),                                                    \
      (int)(imm)))
4241 
4242 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4243 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4244 /// value parameter as a specifier.
4245 ///
4246 /// \headerfile <x86intrin.h>
4247 ///
4248 /// \code
4249 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4250 /// \endcode
4251 ///
4252 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4253 ///
4254 /// \param a
4255 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4256 /// [127:64] of the result.
4257 /// \param imm
4258 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4259 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4260 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4261 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4262 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4263 /// Bit value assignments: \n
4264 /// 00: assign values from bits [15:0] of \a a. \n
4265 /// 01: assign values from bits [31:16] of \a a. \n
4266 /// 10: assign values from bits [47:32] of \a a. \n
4267 /// 11: assign values from bits [63:48] of \a a. \n
4268 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4269 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4270 /// <c>[b6, b4, b2, b0]</c>.
4271 /// \returns A 128-bit integer vector containing the shuffled values.
/* Kept as an expression macro so that imm reaches the builtin exactly as
 * written. a is viewed as [8 x i16]. */
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw(                                            \
      (__v8hi)(__m128i)(a),                                                    \
      (int)(imm)))
4274 
4275 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4276 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4277 /// value parameter as a specifier.
4278 ///
4279 /// \headerfile <x86intrin.h>
4280 ///
4281 /// \code
4282 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4283 /// \endcode
4284 ///
4285 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4286 ///
4287 /// \param a
4288 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4289 /// [63:0] of the result.
4290 /// \param imm
4291 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4292 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4293 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4294 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4295 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4296 /// Bit value assignments: \n
4297 /// 00: assign values from bits [79:64] of \a a. \n
4298 /// 01: assign values from bits [95:80] of \a a. \n
4299 /// 10: assign values from bits [111:96] of \a a. \n
4300 /// 11: assign values from bits [127:112] of \a a. \n
4301 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4302 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4303 /// <c>[b6, b4, b2, b0]</c>.
4304 /// \returns A 128-bit integer vector containing the shuffled values.
/* Kept as an expression macro so that imm reaches the builtin exactly as
 * written. a is viewed as [8 x i16]. */
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw(                                            \
      (__v8hi)(__m128i)(a),                                                    \
      (int)(imm)))
4307 
4308 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4309 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4310 ///
4311 /// \headerfile <x86intrin.h>
4312 ///
4313 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4314 /// instruction.
4315 ///
4316 /// \param __a
4317 /// A 128-bit vector of [16 x i8]. \n
4318 /// Bits [71:64] are written to bits [7:0] of the result. \n
4319 /// Bits [79:72] are written to bits [23:16] of the result. \n
4320 /// Bits [87:80] are written to bits [39:32] of the result. \n
4321 /// Bits [95:88] are written to bits [55:48] of the result. \n
4322 /// Bits [103:96] are written to bits [71:64] of the result. \n
4323 /// Bits [111:104] are written to bits [87:80] of the result. \n
4324 /// Bits [119:112] are written to bits [103:96] of the result. \n
4325 /// Bits [127:120] are written to bits [119:112] of the result.
4326 /// \param __b
4327 /// A 128-bit vector of [16 x i8]. \n
4328 /// Bits [71:64] are written to bits [15:8] of the result. \n
4329 /// Bits [79:72] are written to bits [31:24] of the result. \n
4330 /// Bits [87:80] are written to bits [47:40] of the result. \n
4331 /// Bits [95:88] are written to bits [63:56] of the result. \n
4332 /// Bits [103:96] are written to bits [79:72] of the result. \n
4333 /// Bits [111:104] are written to bits [95:88] of the result. \n
4334 /// Bits [119:112] are written to bits [111:104] of the result. \n
4335 /// Bits [127:120] are written to bits [127:120] of the result.
4336 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4337 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4338  __m128i __b) {
4339  return (__m128i)__builtin_shufflevector(
4340  (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4341  16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4342 }
4343 
4344 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4345 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4346 ///
4347 /// \headerfile <x86intrin.h>
4348 ///
4349 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4350 /// instruction.
4351 ///
4352 /// \param __a
4353 /// A 128-bit vector of [8 x i16]. \n
4354 /// Bits [79:64] are written to bits [15:0] of the result. \n
4355 /// Bits [95:80] are written to bits [47:32] of the result. \n
4356 /// Bits [111:96] are written to bits [79:64] of the result. \n
4357 /// Bits [127:112] are written to bits [111:96] of the result.
4358 /// \param __b
4359 /// A 128-bit vector of [8 x i16]. \n
4360 /// Bits [79:64] are written to bits [31:16] of the result. \n
4361 /// Bits [95:80] are written to bits [63:48] of the result. \n
4362 /// Bits [111:96] are written to bits [95:80] of the result. \n
4363 /// Bits [127:112] are written to bits [127:112] of the result.
4364 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4365 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4366  __m128i __b) {
4367  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4368  8 + 5, 6, 8 + 6, 7, 8 + 7);
4369 }
4370 
4371 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4372 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4373 ///
4374 /// \headerfile <x86intrin.h>
4375 ///
4376 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4377 /// instruction.
4378 ///
4379 /// \param __a
4380 /// A 128-bit vector of [4 x i32]. \n
4381 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4382 /// Bits [127:96] are written to bits [95:64] of the destination.
4383 /// \param __b
4384 /// A 128-bit vector of [4 x i32]. \n
4385 /// Bits [95:64] are written to bits [63:32] of the destination. \n
4386 /// Bits [127:96] are written to bits [127:96] of the destination.
4387 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4388 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4389  __m128i __b) {
4390  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4391  4 + 3);
4392 }
4393 
4394 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4395 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4396 ///
4397 /// \headerfile <x86intrin.h>
4398 ///
4399 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4400 /// instruction.
4401 ///
4402 /// \param __a
4403 /// A 128-bit vector of [2 x i64]. \n
4404 /// Bits [127:64] are written to bits [63:0] of the destination.
4405 /// \param __b
4406 /// A 128-bit vector of [2 x i64]. \n
4407 /// Bits [127:64] are written to bits [127:64] of the destination.
4408 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4410  __m128i __b) {
4411  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4412 }
4413 
4414 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4415 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4416 ///
4417 /// \headerfile <x86intrin.h>
4418 ///
4419 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4420 /// instruction.
4421 ///
4422 /// \param __a
4423 /// A 128-bit vector of [16 x i8]. \n
4424 /// Bits [7:0] are written to bits [7:0] of the result. \n
4425 /// Bits [15:8] are written to bits [23:16] of the result. \n
4426 /// Bits [23:16] are written to bits [39:32] of the result. \n
4427 /// Bits [31:24] are written to bits [55:48] of the result. \n
4428 /// Bits [39:32] are written to bits [71:64] of the result. \n
4429 /// Bits [47:40] are written to bits [87:80] of the result. \n
4430 /// Bits [55:48] are written to bits [103:96] of the result. \n
4431 /// Bits [63:56] are written to bits [119:112] of the result.
4432 /// \param __b
4433 /// A 128-bit vector of [16 x i8]. \n
4434 /// Bits [7:0] are written to bits [15:8] of the result. \n
4435 /// Bits [15:8] are written to bits [31:24] of the result. \n
4436 /// Bits [23:16] are written to bits [47:40] of the result. \n
4437 /// Bits [31:24] are written to bits [63:56] of the result. \n
4438 /// Bits [39:32] are written to bits [79:72] of the result. \n
4439 /// Bits [47:40] are written to bits [95:88] of the result. \n
4440 /// Bits [55:48] are written to bits [111:104] of the result. \n
4441 /// Bits [63:56] are written to bits [127:120] of the result.
4442 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4443 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4444  __m128i __b) {
4445  return (__m128i)__builtin_shufflevector(
4446  (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4447  16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4448 }
4449 
4450 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4451 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4452 /// [8 x i16].
4453 ///
4454 /// \headerfile <x86intrin.h>
4455 ///
4456 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4457 /// instruction.
4458 ///
4459 /// \param __a
4460 /// A 128-bit vector of [8 x i16].
4461 /// Bits [15:0] are written to bits [15:0] of the result. \n
4462 /// Bits [31:16] are written to bits [47:32] of the result. \n
4463 /// Bits [47:32] are written to bits [79:64] of the result. \n
4464 /// Bits [63:48] are written to bits [111:96] of the result.
4465 /// \param __b
4466 /// A 128-bit vector of [8 x i16].
4467 /// Bits [15:0] are written to bits [31:16] of the result. \n
4468 /// Bits [31:16] are written to bits [63:48] of the result. \n
4469 /// Bits [47:32] are written to bits [95:80] of the result. \n
4470 /// Bits [63:48] are written to bits [127:112] of the result.
4471 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4472 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4473  __m128i __b) {
4474  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4475  8 + 1, 2, 8 + 2, 3, 8 + 3);
4476 }
4477 
4478 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4479 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4480 ///
4481 /// \headerfile <x86intrin.h>
4482 ///
4483 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4484 /// instruction.
4485 ///
4486 /// \param __a
4487 /// A 128-bit vector of [4 x i32]. \n
4488 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4489 /// Bits [63:32] are written to bits [95:64] of the destination.
4490 /// \param __b
4491 /// A 128-bit vector of [4 x i32]. \n
4492 /// Bits [31:0] are written to bits [64:32] of the destination. \n
4493 /// Bits [63:32] are written to bits [127:96] of the destination.
4494 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4495 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4496  __m128i __b) {
4497  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4498  4 + 1);
4499 }
4500 
4501 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4502 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4503 ///
4504 /// \headerfile <x86intrin.h>
4505 ///
4506 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4507 /// instruction.
4508 ///
4509 /// \param __a
4510 /// A 128-bit vector of [2 x i64]. \n
4511 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4512 /// \param __b
4513 /// A 128-bit vector of [2 x i64]. \n
4514 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4515 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4517  __m128i __b) {
4518  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4519 }
4520 
4521 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4522 /// integer.
4523 ///
4524 /// \headerfile <x86intrin.h>
4525 ///
4526 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4527 ///
4528 /// \param __a
4529 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4530 /// destination.
4531 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4532 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4533  return (__m64)__a[0];
4534 }
4535 
4536 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4537 /// upper bits.
4538 ///
4539 /// \headerfile <x86intrin.h>
4540 ///
4541 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4542 ///
4543 /// \param __a
4544 /// A 64-bit value.
4545 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4546 /// the operand. The upper 64 bits are assigned zeros.
4547 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4548  return __extension__(__m128i)(__v2di){(long long)__a, 0};
4549 }
4550 
4551 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4552 /// integer vector, zeroing the upper bits.
4553 ///
4554 /// \headerfile <x86intrin.h>
4555 ///
4556 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4557 ///
4558 /// \param __a
4559 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4560 /// destination.
4561 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4562 /// the operand. The upper 64 bits are assigned zeros.
4563 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4564  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4565 }
4566 
4567 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4568 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4569 /// double].
4570 ///
4571 /// \headerfile <x86intrin.h>
4572 ///
4573 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4574 ///
4575 /// \param __a
4576 /// A 128-bit vector of [2 x double]. \n
4577 /// Bits [127:64] are written to bits [63:0] of the destination.
4578 /// \param __b
4579 /// A 128-bit vector of [2 x double]. \n
4580 /// Bits [127:64] are written to bits [127:64] of the destination.
4581 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4582 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4583  __m128d __b) {
4584  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4585 }
4586 
4587 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4588 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4589 /// double].
4590 ///
4591 /// \headerfile <x86intrin.h>
4592 ///
4593 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4594 ///
4595 /// \param __a
4596 /// A 128-bit vector of [2 x double]. \n
4597 /// Bits [63:0] are written to bits [63:0] of the destination.
4598 /// \param __b
4599 /// A 128-bit vector of [2 x double]. \n
4600 /// Bits [63:0] are written to bits [127:64] of the destination.
4601 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4602 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4603  __m128d __b) {
4604  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4605 }
4606 
4607 /// Extracts the sign bits of the double-precision values in the 128-bit
4608 /// vector of [2 x double], zero-extends the value, and writes it to the
4609 /// low-order bits of the destination.
4610 ///
4611 /// \headerfile <x86intrin.h>
4612 ///
4613 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4614 ///
4615 /// \param __a
4616 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4617 /// be extracted.
4618 /// \returns The sign bits from each of the double-precision elements in \a __a,
4619 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4620 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4621  return __builtin_ia32_movmskpd((__v2df)__a);
4622 }
4623 
/// Builds a 128-bit floating-point vector of [2 x double] by selecting one
///    element from each of two 128-bit vectors of [2 x double], as directed
///    by an immediate-value selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double]. Supplies the lower element of the
///    result.
/// \param b
///    A 128-bit vector of [2 x double]. Supplies the upper element of the
///    result.
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a),                        \
                                  (__v2df)(__m128d)(b),                        \
                                  (int)(i)))
4654 
4655 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4656 /// floating-point vector of [4 x float].
4657 ///
4658 /// \headerfile <x86intrin.h>
4659 ///
4660 /// This intrinsic has no corresponding instruction.
4661 ///
4662 /// \param __a
4663 /// A 128-bit floating-point vector of [2 x double].
4664 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4665 /// bitwise pattern as the parameter.
4666 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4667  return (__m128)__a;
4668 }
4669 
4670 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4671 /// integer vector.
4672 ///
4673 /// \headerfile <x86intrin.h>
4674 ///
4675 /// This intrinsic has no corresponding instruction.
4676 ///
4677 /// \param __a
4678 /// A 128-bit floating-point vector of [2 x double].
4679 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4680 /// parameter.
4681 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4682  return (__m128i)__a;
4683 }
4684 
4685 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4686 /// floating-point vector of [2 x double].
4687 ///
4688 /// \headerfile <x86intrin.h>
4689 ///
4690 /// This intrinsic has no corresponding instruction.
4691 ///
4692 /// \param __a
4693 /// A 128-bit floating-point vector of [4 x float].
4694 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4695 /// bitwise pattern as the parameter.
4696 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4697  return (__m128d)__a;
4698 }
4699 
4700 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4701 /// integer vector.
4702 ///
4703 /// \headerfile <x86intrin.h>
4704 ///
4705 /// This intrinsic has no corresponding instruction.
4706 ///
4707 /// \param __a
4708 /// A 128-bit floating-point vector of [4 x float].
4709 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4710 /// parameter.
4711 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4712  return (__m128i)__a;
4713 }
4714 
4715 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4716 /// of [4 x float].
4717 ///
4718 /// \headerfile <x86intrin.h>
4719 ///
4720 /// This intrinsic has no corresponding instruction.
4721 ///
4722 /// \param __a
4723 /// A 128-bit integer vector.
4724 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4725 /// bitwise pattern as the parameter.
4726 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4727  return (__m128)__a;
4728 }
4729 
4730 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4731 /// of [2 x double].
4732 ///
4733 /// \headerfile <x86intrin.h>
4734 ///
4735 /// This intrinsic has no corresponding instruction.
4736 ///
4737 /// \param __a
4738 /// A 128-bit integer vector.
4739 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4740 /// bitwise pattern as the parameter.
4741 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4742  return (__m128d)__a;
4743 }
4744 
#ifdef __cplusplus
extern "C" {
#endif

/// Signals to the processor that the calling code is inside a spin loop,
///    so that power consumption during the loop can be optimized.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
/* The function-attribute helper macros are private to this header; remove
 * them so they do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
4763 
/* Builds the 2-bit selector [x, y] used by _mm_shuffle_pd: x lands in bit 1
 * and y in bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
4765 
/* Denormals-are-zero control: bit 6 (0x0040) of the control/status word that
 * _mm_getcsr() reads and _mm_setcsr() writes. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Values accepted by _MM_SET_DENORMALS_ZERO_MODE and produced by
 * _MM_GET_DENORMALS_ZERO_MODE. */
#define _MM_DENORMALS_ZERO_OFF (0x0000U)
#define _MM_DENORMALS_ZERO_ON (0x0040U)

/* Yields the current mode: the control/status word masked to the
 * denormals-are-zero bit. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_MM_DENORMALS_ZERO_MASK & _mm_getcsr())

/* Clears the denormals-are-zero bit, ORs in (x), and writes the result back;
 * all other control/status bits are preserved. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK)))
4774 
4775 #endif /* __EMMINTRIN_H */
_mm_xor_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:407
_mm_set_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1787
_mm_load1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1533
_mm_cvtepi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1297
_mm_cvtpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1454
_mm_undefined_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1719
_mm_set_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3474
_mm_mul_epu32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2409
__x
static __inline unsigned char unsigned int __x
Definition: adxintrin.h:22
_mm_srli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2987
_mm_cmpeq_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3023
_mm_cmpnle_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:894
_mm_add_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2037
_mm_loadl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3389
_mm_madd_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2240
_mm_unpacklo_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4602
_mm_pause
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...
_mm_set_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3426
_mm_add_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2058
_mm_storeu_si32
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, __m128i __b)
Stores a 32-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3858
_mm_loadu_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3369
_mm_sqrt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:236
_mm_cmpgt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:743
_mm_cmpgt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3120
_mm_subs_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2594
_mm_unpackhi_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4582
__v
struct __storeu_i16 *__P __v
Definition: immintrin.h:401
_mm_comige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1067
_mm_sub_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:114
_mm_srai_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2853
_mm_ucomigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1190
_mm_cmpunord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:822
_mm_storeu_si16
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, __m128i __b)
Stores a 16-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3878
_mm_div_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:193
_mm_and_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2611
_mm_add_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2096
_mm_set_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3563
_mm_cvtsd_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1353
_mm_unpacklo_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4495
_mm_cmpeq_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3041
_mm_set_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1737
_mm_max_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:336
_mm_subs_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2536
_mm_setr_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3671
_mm_sll_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2796
__a
static __inline__ void int __a
Definition: emmintrin.h:3986
_mm_ucomineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1240
_mm_ucomile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1165
_mm_castpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4666
_mm_mul_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:171
_mm_sub_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2516
_mm_set1_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3584
_mm_storel_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1992
__DEFAULT_FN_ATTRS_MMX
#define __DEFAULT_FN_ATTRS_MMX
Definition: emmintrin.h:55
_mm_loadu_si16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a)
Loads a 16-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1629
_mm_sra_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2872
_mm_storeu_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1934
_mm_unpackhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4365
_mm_load_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1648
_mm_undefined_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3406
_mm_store_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1878
_mm_packus_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4129
xmmintrin.h
_mm_srl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3005
_mm_setr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1807
_mm_cmpneq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:846
_mm_comineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1092
_mm_cvtsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1330
__DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS
Definition: emmintrin.h:52
_mm_cmpgt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:484
_mm_max_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2278
_mm_cvttps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3279
_mm_maskmoveu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3907
_mm_cmpeq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:425
_mm_cmple_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:464
_mm_loadh_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1673
_mm_cmpge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:768
_mm_sqrt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:253
_mm_cvtps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3264
_mm_movepi64_pi64
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4532
_mm_min_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2297
_mm_xor_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2663
_mm_andnot_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:373
_mm_set1_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3601
_mm_cmpnge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:943
_mm_unpackhi_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4409
_mm_cvtps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1276
_mm_cvtsi32_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1375
_mm_add_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:74
_mm_setr_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3772
_mm_cmpeq_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3059
_mm_cmplt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:695
_mm_unpacklo_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4443
_mm_set1_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3652
_mm_cmple_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:719
_mm_cvttpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1423
_mm_mulhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2335
_mm_comile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1017
_mm_loadu_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1573
_mm_sra_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2834
_mm_cvtpi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1487
_mm_clflush
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
_mm_cmplt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3140
_mm_move_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4563
_mm_mulhi_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2354
_mm_cmpnle_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:609
_mm_avg_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2215
_mm_load_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3354
_mm_mullo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2373
_mm_cvtsi128_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3323
_mm_cmpeq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:671
_mm_move_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1840
_mm_sub_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2481
_mm_srl_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2933
_mm_cmpnlt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:870
_mm_cvtsi64_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3308
_mm_unpackhi_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4388
_mm_sub_si64
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2499
_mm_sub_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2464
_mm_adds_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2177
_mm_slli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2778
_mm_sll_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2760
_mm_set_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3514
__attribute__
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
_mm_castsi128_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4726
_mm_storeh_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1973
_mm_srli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2951
_mm_unpacklo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4472
_mm_packs_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4102
_mm_cvtepi32_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3250
_mm_comieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:967
_mm_storeu_si64
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, __m128i __b)
Stores a 64-bit integer value from the low element of a 128-bit integer vector.
Definition: emmintrin.h:3838
_mm_slli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2706
_mm_ucomieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1115
_mm_store_pd1
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1917
_mm_setzero_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3787
_mm_castps_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4711
_mm_adds_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2157
_mm_div_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:212
_mm_or_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:390
_mm_andnot_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2630
_mm_unpacklo_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4516
_mm_cmpunord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:549
_mm_cvtpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1315
_mm_subs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2556
_mm_storeu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3818
_mm_mul_su32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2391
_mm_stream_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:3948
_mm_set1_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3635
_mm_loadl_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1699
_mm_slli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2742
_mm_cmpge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:504
_mm_sll_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2724
_Float16
__device__ _Float16
Definition: __clang_hip_libdevice_declares.h:298
__p
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:24
_mm_cvttpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1472
_mm_min_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:294
_mm_packs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers,...
Definition: emmintrin.h:4075
_mm_sad_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2430
_mm_comigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1042
_mm_comilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:992
_mm_storer_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:1956
_mm_setr_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3725
_mm_cvtss_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1400
_mm_srai_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2815
_mm_cmplt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:444
_mm_castps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4696
_mm_movemask_pd
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4620
_mm_min_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:275
_mm_sub_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:132
_mm_cmpneq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:569
_mm_movpi64_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4547
_mm_cmpord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:526
_mm_movemask_epi8
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4204
_mm_set1_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3618
_mm_set1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1753
_mm_store1_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1897
_mm_add_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:92
_mm_cvtsi128_si64
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3339
_mm_avg_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2196
_mm_or_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2646
_mm_adds_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2116
__b
static __inline__ vector float vector float __b
Definition: altivec.h:578
_mm_mfence
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
_mm_loadu_si64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a)
Loads a 64-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1591
_mm_cmpnlt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:589
_mm_cmplt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3180
_mm_store_sd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1857
_mm_lfence
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
_mm_add_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2016
_mm_loadr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1557
_mm_cmpgt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3100
_mm_loadu_si32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a)
Loads a 32-bit integer value to the low element of a 128-bit integer vector and clears the upper elem...
Definition: emmintrin.h:1610
_mm_sub_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2447
_mm_cmpgt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3078
_mm_unpackhi_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4337
_mm_max_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:317
_mm_cvtpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1258
_mm_ucomilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1140
_mm_store_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3803
_mm_cvtsd_f64
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1502
__c
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4788
_mm_castpd_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4681
_mm_stream_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:3966
_mm_min_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2316
_mm_cmpnge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:649
_mm_subs_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
Definition: emmintrin.h:2575
_mm_and_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:353
_mm_mul_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:153
_mm_adds_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2137
_mm_cmplt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3160
_mm_storel_epi64
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3926
_mm_set_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3447
_mm_ucomige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1215
_mm_cvttsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
Definition: emmintrin.h:1439
_mm_cvtsi32_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3293
_mm_cmpngt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:629
_mm_set_pd1
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1769
_mm_castsi128_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4741
_mm_setzero_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1821
_mm_add_si64
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2075
_mm_srli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2915
_mm_srl_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2969
_mm_max_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2259
_mm_setr_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3693
_mm_cmpngt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:918
_mm_cmpord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:795
_mm_load_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1517