clang  9.0.0svn
xmmintrin.h
Go to the documentation of this file.
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12 
13 #include <mmintrin.h>
14 
/* 16-byte vector types used by the SSE intrinsics in this file.
 * __v4sf is the float view that the intrinsic bodies cast their operands to;
 * __m128 is the float vector type the intrinsics take and return, declared
 * with an explicit 16-byte alignment. */
15 typedef int __v4si __attribute__((__vector_size__(16)));
16 typedef float __v4sf __attribute__((__vector_size__(16)));
17 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
18 
/* Same element type and size as __m128, but declared with 1-byte alignment. */
19 typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
20 
21 /* Unsigned types: 16-byte vector of unsigned int, the view used by the
 * bitwise intrinsics (_mm_and_ps, _mm_andnot_ps, _mm_or_ps, _mm_xor_ps). */
22 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
23 
24 /* This header should only be included in a hosted environment as it depends on
25  * a standard library to provide allocation routines. */
26 #if __STDC_HOSTED__
27 #include <mm_malloc.h>
28 #endif
29 
30 /* Define the default attributes for the functions in this file.
 * Both variants request always-inline and no-debug treatment.
 * __DEFAULT_FN_ATTRS selects the "sse" target with a minimum vector width of
 * 128; __DEFAULT_FN_ATTRS_MMX selects the "mmx,sse" target with a minimum
 * vector width of 64. */
31 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
32 #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
33 
34 /// Adds the 32-bit float values in the low-order bits of the operands.
35 ///
36 /// \headerfile <x86intrin.h>
37 ///
38 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
39 ///
40 /// \param __a
41 /// A 128-bit vector of [4 x float] containing one of the source operands.
42 /// The lower 32 bits of this operand are used in the calculation.
43 /// \param __b
44 /// A 128-bit vector of [4 x float] containing one of the source operands.
45 /// The lower 32 bits of this operand are used in the calculation.
46 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
47 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
48 /// the upper 96 bits of the first source operand.
49 static __inline__ __m128 __DEFAULT_FN_ATTRS
50 _mm_add_ss(__m128 __a, __m128 __b)
51 {
52  __a[0] += __b[0];
53  return __a;
54 }
55 
56 /// Adds two 128-bit vectors of [4 x float], and returns the results of
57 /// the addition.
58 ///
59 /// \headerfile <x86intrin.h>
60 ///
61 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
62 ///
63 /// \param __a
64 /// A 128-bit vector of [4 x float] containing one of the source operands.
65 /// \param __b
66 /// A 128-bit vector of [4 x float] containing one of the source operands.
67 /// \returns A 128-bit vector of [4 x float] containing the sums of both
68 /// operands.
69 static __inline__ __m128 __DEFAULT_FN_ATTRS
70 _mm_add_ps(__m128 __a, __m128 __b)
71 {
72  return (__m128)((__v4sf)__a + (__v4sf)__b);
73 }
74 
75 /// Subtracts the 32-bit float value in the low-order bits of the second
76 /// operand from the corresponding value in the first operand.
77 ///
78 /// \headerfile <x86intrin.h>
79 ///
80 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
81 ///
82 /// \param __a
83 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
84 /// of this operand are used in the calculation.
85 /// \param __b
86 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
87 /// bits of this operand are used in the calculation.
88 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
89 /// difference of the lower 32 bits of both operands. The upper 96 bits are
90 /// copied from the upper 96 bits of the first source operand.
91 static __inline__ __m128 __DEFAULT_FN_ATTRS
92 _mm_sub_ss(__m128 __a, __m128 __b)
93 {
94  __a[0] -= __b[0];
95  return __a;
96 }
97 
98 /// Subtracts each of the values of the second operand from the first
99 /// operand, both of which are 128-bit vectors of [4 x float] and returns
100 /// the results of the subtraction.
101 ///
102 /// \headerfile <x86intrin.h>
103 ///
104 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
105 ///
106 /// \param __a
107 /// A 128-bit vector of [4 x float] containing the minuend.
108 /// \param __b
109 /// A 128-bit vector of [4 x float] containing the subtrahend.
110 /// \returns A 128-bit vector of [4 x float] containing the differences between
111 /// both operands.
112 static __inline__ __m128 __DEFAULT_FN_ATTRS
113 _mm_sub_ps(__m128 __a, __m128 __b)
114 {
115  return (__m128)((__v4sf)__a - (__v4sf)__b);
116 }
117 
118 /// Multiplies two 32-bit float values in the low-order bits of the
119 /// operands.
120 ///
121 /// \headerfile <x86intrin.h>
122 ///
123 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
124 ///
125 /// \param __a
126 /// A 128-bit vector of [4 x float] containing one of the source operands.
127 /// The lower 32 bits of this operand are used in the calculation.
128 /// \param __b
129 /// A 128-bit vector of [4 x float] containing one of the source operands.
130 /// The lower 32 bits of this operand are used in the calculation.
131 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
132 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
133 /// bits of the first source operand.
134 static __inline__ __m128 __DEFAULT_FN_ATTRS
135 _mm_mul_ss(__m128 __a, __m128 __b)
136 {
137  __a[0] *= __b[0];
138  return __a;
139 }
140 
141 /// Multiplies two 128-bit vectors of [4 x float] and returns the
142 /// results of the multiplication.
143 ///
144 /// \headerfile <x86intrin.h>
145 ///
146 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
147 ///
148 /// \param __a
149 /// A 128-bit vector of [4 x float] containing one of the source operands.
150 /// \param __b
151 /// A 128-bit vector of [4 x float] containing one of the source operands.
152 /// \returns A 128-bit vector of [4 x float] containing the products of both
153 /// operands.
154 static __inline__ __m128 __DEFAULT_FN_ATTRS
155 _mm_mul_ps(__m128 __a, __m128 __b)
156 {
157  return (__m128)((__v4sf)__a * (__v4sf)__b);
158 }
159 
160 /// Divides the value in the low-order 32 bits of the first operand by
161 /// the corresponding value in the second operand.
162 ///
163 /// \headerfile <x86intrin.h>
164 ///
165 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
166 ///
167 /// \param __a
168 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
169 /// bits of this operand are used in the calculation.
170 /// \param __b
171 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
172 /// of this operand are used in the calculation.
173 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
174 /// lower 32 bits of both operands. The upper 96 bits are copied from the
175 /// upper 96 bits of the first source operand.
176 static __inline__ __m128 __DEFAULT_FN_ATTRS
177 _mm_div_ss(__m128 __a, __m128 __b)
178 {
179  __a[0] /= __b[0];
180  return __a;
181 }
182 
183 /// Divides two 128-bit vectors of [4 x float].
184 ///
185 /// \headerfile <x86intrin.h>
186 ///
187 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
188 ///
189 /// \param __a
190 /// A 128-bit vector of [4 x float] containing the dividend.
191 /// \param __b
192 /// A 128-bit vector of [4 x float] containing the divisor.
193 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
194 /// operands.
195 static __inline__ __m128 __DEFAULT_FN_ATTRS
196 _mm_div_ps(__m128 __a, __m128 __b)
197 {
198  return (__m128)((__v4sf)__a / (__v4sf)__b);
199 }
200 
201 /// Calculates the square root of the value stored in the low-order bits
202 /// of a 128-bit vector of [4 x float].
203 ///
204 /// \headerfile <x86intrin.h>
205 ///
206 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
207 ///
208 /// \param __a
209 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
210 /// used in the calculation.
211 /// \returns A 128-bit vector of [4 x float] containing the square root of the
212 /// value in the low-order bits of the operand.
213 static __inline__ __m128 __DEFAULT_FN_ATTRS
215 {
216  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
217 }
218 
219 /// Calculates the square roots of the values stored in a 128-bit vector
220 /// of [4 x float].
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
225 ///
226 /// \param __a
227 /// A 128-bit vector of [4 x float].
228 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
229 /// values in the operand.
230 static __inline__ __m128 __DEFAULT_FN_ATTRS
232 {
233  return __builtin_ia32_sqrtps((__v4sf)__a);
234 }
235 
236 /// Calculates the approximate reciprocal of the value stored in the
237 /// low-order bits of a 128-bit vector of [4 x float].
238 ///
239 /// \headerfile <x86intrin.h>
240 ///
241 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
242 ///
243 /// \param __a
244 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
245 /// used in the calculation.
246 /// \returns A 128-bit vector of [4 x float] containing the approximate
247 /// reciprocal of the value in the low-order bits of the operand.
248 static __inline__ __m128 __DEFAULT_FN_ATTRS
250 {
251  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
252 }
253 
254 /// Calculates the approximate reciprocals of the values stored in a
255 /// 128-bit vector of [4 x float].
256 ///
257 /// \headerfile <x86intrin.h>
258 ///
259 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
260 ///
261 /// \param __a
262 /// A 128-bit vector of [4 x float].
263 /// \returns A 128-bit vector of [4 x float] containing the approximate
264 /// reciprocals of the values in the operand.
265 static __inline__ __m128 __DEFAULT_FN_ATTRS
267 {
268  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
269 }
270 
271 /// Calculates the approximate reciprocal of the square root of the value
272 /// stored in the low-order bits of a 128-bit vector of [4 x float].
273 ///
274 /// \headerfile <x86intrin.h>
275 ///
276 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
277 ///
278 /// \param __a
279 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
280 /// used in the calculation.
281 /// \returns A 128-bit vector of [4 x float] containing the approximate
282 /// reciprocal of the square root of the value in the low-order bits of the
283 /// operand.
284 static __inline__ __m128 __DEFAULT_FN_ATTRS
286 {
287  return __builtin_ia32_rsqrtss((__v4sf)__a);
288 }
289 
290 /// Calculates the approximate reciprocals of the square roots of the
291 /// values stored in a 128-bit vector of [4 x float].
292 ///
293 /// \headerfile <x86intrin.h>
294 ///
295 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
296 ///
297 /// \param __a
298 /// A 128-bit vector of [4 x float].
299 /// \returns A 128-bit vector of [4 x float] containing the approximate
300 /// reciprocals of the square roots of the values in the operand.
301 static __inline__ __m128 __DEFAULT_FN_ATTRS
303 {
304  return __builtin_ia32_rsqrtps((__v4sf)__a);
305 }
306 
307 /// Compares two 32-bit float values in the low-order bits of both
308 /// operands and returns the lesser value in the low-order bits of the
309 /// vector of [4 x float].
310 ///
311 /// \headerfile <x86intrin.h>
312 ///
313 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
314 ///
315 /// \param __a
316 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
317 /// 32 bits of this operand are used in the comparison.
318 /// \param __b
319 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
320 /// 32 bits of this operand are used in the comparison.
321 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
322 /// minimum value between both operands. The upper 96 bits are copied from
323 /// the upper 96 bits of the first source operand.
324 static __inline__ __m128 __DEFAULT_FN_ATTRS
325 _mm_min_ss(__m128 __a, __m128 __b)
326 {
327  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
328 }
329 
330 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
331 /// of each pair of values.
332 ///
333 /// \headerfile <x86intrin.h>
334 ///
335 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
336 ///
337 /// \param __a
338 /// A 128-bit vector of [4 x float] containing one of the operands.
339 /// \param __b
340 /// A 128-bit vector of [4 x float] containing one of the operands.
341 /// \returns A 128-bit vector of [4 x float] containing the minimum values
342 /// between both operands.
343 static __inline__ __m128 __DEFAULT_FN_ATTRS
344 _mm_min_ps(__m128 __a, __m128 __b)
345 {
346  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
347 }
348 
349 /// Compares two 32-bit float values in the low-order bits of both
350 /// operands and returns the greater value in the low-order bits of a 128-bit
351 /// vector of [4 x float].
352 ///
353 /// \headerfile <x86intrin.h>
354 ///
355 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
356 ///
357 /// \param __a
358 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
359 /// 32 bits of this operand are used in the comparison.
360 /// \param __b
361 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
362 /// 32 bits of this operand are used in the comparison.
363 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
364 /// maximum value between both operands. The upper 96 bits are copied from
365 /// the upper 96 bits of the first source operand.
366 static __inline__ __m128 __DEFAULT_FN_ATTRS
367 _mm_max_ss(__m128 __a, __m128 __b)
368 {
369  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
370 }
371 
372 /// Compares two 128-bit vectors of [4 x float] and returns the greater
373 /// of each pair of values.
374 ///
375 /// \headerfile <x86intrin.h>
376 ///
377 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
378 ///
379 /// \param __a
380 /// A 128-bit vector of [4 x float] containing one of the operands.
381 /// \param __b
382 /// A 128-bit vector of [4 x float] containing one of the operands.
383 /// \returns A 128-bit vector of [4 x float] containing the maximum values
384 /// between both operands.
385 static __inline__ __m128 __DEFAULT_FN_ATTRS
386 _mm_max_ps(__m128 __a, __m128 __b)
387 {
388  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
389 }
390 
391 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
392 ///
393 /// \headerfile <x86intrin.h>
394 ///
395 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
396 ///
397 /// \param __a
398 /// A 128-bit vector containing one of the source operands.
399 /// \param __b
400 /// A 128-bit vector containing one of the source operands.
401 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
402 /// values between both operands.
403 static __inline__ __m128 __DEFAULT_FN_ATTRS
404 _mm_and_ps(__m128 __a, __m128 __b)
405 {
406  return (__m128)((__v4su)__a & (__v4su)__b);
407 }
408 
409 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
410 /// the one's complement of the values contained in the first source
411 /// operand.
412 ///
413 /// \headerfile <x86intrin.h>
414 ///
415 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
416 ///
417 /// \param __a
418 /// A 128-bit vector of [4 x float] containing the first source operand. The
419 /// one's complement of this value is used in the bitwise AND.
420 /// \param __b
421 /// A 128-bit vector of [4 x float] containing the second source operand.
422 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
423 /// one's complement of the first operand and the values in the second
424 /// operand.
425 static __inline__ __m128 __DEFAULT_FN_ATTRS
426 _mm_andnot_ps(__m128 __a, __m128 __b)
427 {
428  return (__m128)(~(__v4su)__a & (__v4su)__b);
429 }
430 
431 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
432 ///
433 /// \headerfile <x86intrin.h>
434 ///
435 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
436 ///
437 /// \param __a
438 /// A 128-bit vector of [4 x float] containing one of the source operands.
439 /// \param __b
440 /// A 128-bit vector of [4 x float] containing one of the source operands.
441 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
442 /// values between both operands.
443 static __inline__ __m128 __DEFAULT_FN_ATTRS
444 _mm_or_ps(__m128 __a, __m128 __b)
445 {
446  return (__m128)((__v4su)__a | (__v4su)__b);
447 }
448 
449 /// Performs a bitwise exclusive OR of two 128-bit vectors of
450 /// [4 x float].
451 ///
452 /// \headerfile <x86intrin.h>
453 ///
454 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
455 ///
456 /// \param __a
457 /// A 128-bit vector of [4 x float] containing one of the source operands.
458 /// \param __b
459 /// A 128-bit vector of [4 x float] containing one of the source operands.
460 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
461 /// of the values between both operands.
462 static __inline__ __m128 __DEFAULT_FN_ATTRS
463 _mm_xor_ps(__m128 __a, __m128 __b)
464 {
465  return (__m128)((__v4su)__a ^ (__v4su)__b);
466 }
467 
468 /// Compares two 32-bit float values in the low-order bits of both
469 /// operands for equality and returns the result of the comparison in the
470 /// low-order bits of a vector [4 x float].
471 ///
472 /// \headerfile <x86intrin.h>
473 ///
474 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
475 ///
476 /// \param __a
477 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
478 /// 32 bits of this operand are used in the comparison.
479 /// \param __b
480 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
481 /// 32 bits of this operand are used in the comparison.
482 /// \returns A 128-bit vector of [4 x float] containing the comparison results
483 /// in the low-order bits.
484 static __inline__ __m128 __DEFAULT_FN_ATTRS
485 _mm_cmpeq_ss(__m128 __a, __m128 __b)
486 {
487  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
488 }
489 
490 /// Compares each of the corresponding 32-bit float values of the
491 /// 128-bit vectors of [4 x float] for equality.
492 ///
493 /// \headerfile <x86intrin.h>
494 ///
495 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
496 ///
497 /// \param __a
498 /// A 128-bit vector of [4 x float].
499 /// \param __b
500 /// A 128-bit vector of [4 x float].
501 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
502 static __inline__ __m128 __DEFAULT_FN_ATTRS
503 _mm_cmpeq_ps(__m128 __a, __m128 __b)
504 {
505  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
506 }
507 
508 /// Compares two 32-bit float values in the low-order bits of both
509 /// operands to determine if the value in the first operand is less than the
510 /// corresponding value in the second operand and returns the result of the
511 /// comparison in the low-order bits of a vector of [4 x float].
512 ///
513 /// \headerfile <x86intrin.h>
514 ///
515 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
516 ///
517 /// \param __a
518 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
519 /// 32 bits of this operand are used in the comparison.
520 /// \param __b
521 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
522 /// 32 bits of this operand are used in the comparison.
523 /// \returns A 128-bit vector of [4 x float] containing the comparison results
524 /// in the low-order bits.
525 static __inline__ __m128 __DEFAULT_FN_ATTRS
526 _mm_cmplt_ss(__m128 __a, __m128 __b)
527 {
528  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
529 }
530 
531 /// Compares each of the corresponding 32-bit float values of the
532 /// 128-bit vectors of [4 x float] to determine if the values in the first
533 /// operand are less than those in the second operand.
534 ///
535 /// \headerfile <x86intrin.h>
536 ///
537 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
538 ///
539 /// \param __a
540 /// A 128-bit vector of [4 x float].
541 /// \param __b
542 /// A 128-bit vector of [4 x float].
543 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
544 static __inline__ __m128 __DEFAULT_FN_ATTRS
545 _mm_cmplt_ps(__m128 __a, __m128 __b)
546 {
547  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
548 }
549 
550 /// Compares two 32-bit float values in the low-order bits of both
551 /// operands to determine if the value in the first operand is less than or
552 /// equal to the corresponding value in the second operand and returns the
553 /// result of the comparison in the low-order bits of a vector of
554 /// [4 x float].
555 ///
556 /// \headerfile <x86intrin.h>
557 ///
558 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
559 ///
560 /// \param __a
561 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
562 /// 32 bits of this operand are used in the comparison.
563 /// \param __b
564 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
565 /// 32 bits of this operand are used in the comparison.
566 /// \returns A 128-bit vector of [4 x float] containing the comparison results
567 /// in the low-order bits.
568 static __inline__ __m128 __DEFAULT_FN_ATTRS
569 _mm_cmple_ss(__m128 __a, __m128 __b)
570 {
571  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
572 }
573 
574 /// Compares each of the corresponding 32-bit float values of the
575 /// 128-bit vectors of [4 x float] to determine if the values in the first
576 /// operand are less than or equal to those in the second operand.
577 ///
578 /// \headerfile <x86intrin.h>
579 ///
580 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
581 ///
582 /// \param __a
583 /// A 128-bit vector of [4 x float].
584 /// \param __b
585 /// A 128-bit vector of [4 x float].
586 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
587 static __inline__ __m128 __DEFAULT_FN_ATTRS
588 _mm_cmple_ps(__m128 __a, __m128 __b)
589 {
590  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
591 }
592 
593 /// Compares two 32-bit float values in the low-order bits of both
594 /// operands to determine if the value in the first operand is greater than
595 /// the corresponding value in the second operand and returns the result of
596 /// the comparison in the low-order bits of a vector of [4 x float].
597 ///
598 /// \headerfile <x86intrin.h>
599 ///
600 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
601 ///
602 /// \param __a
603 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
604 /// 32 bits of this operand are used in the comparison.
605 /// \param __b
606 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
607 /// 32 bits of this operand are used in the comparison.
608 /// \returns A 128-bit vector of [4 x float] containing the comparison results
609 /// in the low-order bits.
610 static __inline__ __m128 __DEFAULT_FN_ATTRS
611 _mm_cmpgt_ss(__m128 __a, __m128 __b)
612 {
613  return (__m128)__builtin_shufflevector((__v4sf)__a,
614  (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
615  4, 1, 2, 3);
616 }
617 
618 /// Compares each of the corresponding 32-bit float values of the
619 /// 128-bit vectors of [4 x float] to determine if the values in the first
620 /// operand are greater than those in the second operand.
621 ///
622 /// \headerfile <x86intrin.h>
623 ///
624 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
625 ///
626 /// \param __a
627 /// A 128-bit vector of [4 x float].
628 /// \param __b
629 /// A 128-bit vector of [4 x float].
630 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
631 static __inline__ __m128 __DEFAULT_FN_ATTRS
632 _mm_cmpgt_ps(__m128 __a, __m128 __b)
633 {
634  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
635 }
636 
637 /// Compares two 32-bit float values in the low-order bits of both
638 /// operands to determine if the value in the first operand is greater than
639 /// or equal to the corresponding value in the second operand and returns
640 /// the result of the comparison in the low-order bits of a vector of
641 /// [4 x float].
642 ///
643 /// \headerfile <x86intrin.h>
644 ///
645 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
646 ///
647 /// \param __a
648 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
649 /// 32 bits of this operand are used in the comparison.
650 /// \param __b
651 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
652 /// 32 bits of this operand are used in the comparison.
653 /// \returns A 128-bit vector of [4 x float] containing the comparison results
654 /// in the low-order bits.
655 static __inline__ __m128 __DEFAULT_FN_ATTRS
656 _mm_cmpge_ss(__m128 __a, __m128 __b)
657 {
658  return (__m128)__builtin_shufflevector((__v4sf)__a,
659  (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
660  4, 1, 2, 3);
661 }
662 
663 /// Compares each of the corresponding 32-bit float values of the
664 /// 128-bit vectors of [4 x float] to determine if the values in the first
665 /// operand are greater than or equal to those in the second operand.
666 ///
667 /// \headerfile <x86intrin.h>
668 ///
669 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
670 ///
671 /// \param __a
672 /// A 128-bit vector of [4 x float].
673 /// \param __b
674 /// A 128-bit vector of [4 x float].
675 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
676 static __inline__ __m128 __DEFAULT_FN_ATTRS
677 _mm_cmpge_ps(__m128 __a, __m128 __b)
678 {
679  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
680 }
681 
682 /// Compares two 32-bit float values in the low-order bits of both
683 /// operands for inequality and returns the result of the comparison in the
684 /// low-order bits of a vector of [4 x float].
685 ///
686 /// \headerfile <x86intrin.h>
687 ///
688 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
689 /// instructions.
690 ///
691 /// \param __a
692 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
693 /// 32 bits of this operand are used in the comparison.
694 /// \param __b
695 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
696 /// 32 bits of this operand are used in the comparison.
697 /// \returns A 128-bit vector of [4 x float] containing the comparison results
698 /// in the low-order bits.
699 static __inline__ __m128 __DEFAULT_FN_ATTRS
700 _mm_cmpneq_ss(__m128 __a, __m128 __b)
701 {
702  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
703 }
704 
705 /// Compares each of the corresponding 32-bit float values of the
706 /// 128-bit vectors of [4 x float] for inequality.
707 ///
708 /// \headerfile <x86intrin.h>
709 ///
710 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
711 /// instructions.
712 ///
713 /// \param __a
714 /// A 128-bit vector of [4 x float].
715 /// \param __b
716 /// A 128-bit vector of [4 x float].
717 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
718 static __inline__ __m128 __DEFAULT_FN_ATTRS
719 _mm_cmpneq_ps(__m128 __a, __m128 __b)
720 {
721  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
722 }
723 
724 /// Compares two 32-bit float values in the low-order bits of both
725 /// operands to determine if the value in the first operand is not less than
726 /// the corresponding value in the second operand and returns the result of
727 /// the comparison in the low-order bits of a vector of [4 x float].
728 ///
729 /// \headerfile <x86intrin.h>
730 ///
731 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
732 /// instructions.
733 ///
734 /// \param __a
735 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
736 /// 32 bits of this operand are used in the comparison.
737 /// \param __b
738 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
739 /// 32 bits of this operand are used in the comparison.
740 /// \returns A 128-bit vector of [4 x float] containing the comparison results
741 /// in the low-order bits.
742 static __inline__ __m128 __DEFAULT_FN_ATTRS
743 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
744 {
745  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
746 }
747 
748 /// Compares each of the corresponding 32-bit float values of the
749 /// 128-bit vectors of [4 x float] to determine if the values in the first
750 /// operand are not less than those in the second operand.
751 ///
752 /// \headerfile <x86intrin.h>
753 ///
754 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
755 /// instructions.
756 ///
757 /// \param __a
758 /// A 128-bit vector of [4 x float].
759 /// \param __b
760 /// A 128-bit vector of [4 x float].
761 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
762 static __inline__ __m128 __DEFAULT_FN_ATTRS
763 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
764 {
765  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
766 }
767 
768 /// Compares two 32-bit float values in the low-order bits of both
769 /// operands to determine if the value in the first operand is not less than
770 /// or equal to the corresponding value in the second operand and returns
771 /// the result of the comparison in the low-order bits of a vector of
772 /// [4 x float].
773 ///
774 /// \headerfile <x86intrin.h>
775 ///
776 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
777 /// instructions.
778 ///
779 /// \param __a
780 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
781 /// 32 bits of this operand are used in the comparison.
782 /// \param __b
783 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
784 /// 32 bits of this operand are used in the comparison.
785 /// \returns A 128-bit vector of [4 x float] containing the comparison results
786 /// in the low-order bits.
787 static __inline__ __m128 __DEFAULT_FN_ATTRS
788 _mm_cmpnle_ss(__m128 __a, __m128 __b)
789 {
790  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
791 }
792 
793 /// Compares each of the corresponding 32-bit float values of the
794 /// 128-bit vectors of [4 x float] to determine if the values in the first
795 /// operand are not less than or equal to those in the second operand.
796 ///
797 /// \headerfile <x86intrin.h>
798 ///
799 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
800 /// instructions.
801 ///
802 /// \param __a
803 /// A 128-bit vector of [4 x float].
804 /// \param __b
805 /// A 128-bit vector of [4 x float].
806 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
807 static __inline__ __m128 __DEFAULT_FN_ATTRS
808 _mm_cmpnle_ps(__m128 __a, __m128 __b)
809 {
810  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
811 }
812 
813 /// Compares two 32-bit float values in the low-order bits of both
814 /// operands to determine if the value in the first operand is not greater
815 /// than the corresponding value in the second operand and returns the
816 /// result of the comparison in the low-order bits of a vector of
817 /// [4 x float].
818 ///
819 /// \headerfile <x86intrin.h>
820 ///
821 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
822 /// instructions.
823 ///
824 /// \param __a
825 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
826 /// 32 bits of this operand are used in the comparison.
827 /// \param __b
828 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
829 /// 32 bits of this operand are used in the comparison.
830 /// \returns A 128-bit vector of [4 x float] containing the comparison results
831 /// in the low-order bits.
832 static __inline__ __m128 __DEFAULT_FN_ATTRS
833 _mm_cmpngt_ss(__m128 __a, __m128 __b)
834 {
835  return (__m128)__builtin_shufflevector((__v4sf)__a,
836  (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
837  4, 1, 2, 3);
838 }
839 
840 /// Compares each of the corresponding 32-bit float values of the
841 /// 128-bit vectors of [4 x float] to determine if the values in the first
842 /// operand are not greater than those in the second operand.
843 ///
844 /// \headerfile <x86intrin.h>
845 ///
846 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
847 /// instructions.
848 ///
849 /// \param __a
850 /// A 128-bit vector of [4 x float].
851 /// \param __b
852 /// A 128-bit vector of [4 x float].
853 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
854 static __inline__ __m128 __DEFAULT_FN_ATTRS
855 _mm_cmpngt_ps(__m128 __a, __m128 __b)
856 {
857  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
858 }
859 
860 /// Compares two 32-bit float values in the low-order bits of both
861 /// operands to determine if the value in the first operand is not greater
862 /// than or equal to the corresponding value in the second operand and
863 /// returns the result of the comparison in the low-order bits of a vector
864 /// of [4 x float].
865 ///
866 /// \headerfile <x86intrin.h>
867 ///
868 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
869 /// instructions.
870 ///
871 /// \param __a
872 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
873 /// 32 bits of this operand are used in the comparison.
874 /// \param __b
875 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
876 /// 32 bits of this operand are used in the comparison.
877 /// \returns A 128-bit vector of [4 x float] containing the comparison results
878 /// in the low-order bits.
879 static __inline__ __m128 __DEFAULT_FN_ATTRS
880 _mm_cmpnge_ss(__m128 __a, __m128 __b)
881 {
882  return (__m128)__builtin_shufflevector((__v4sf)__a,
883  (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
884  4, 1, 2, 3);
885 }
886 
887 /// Compares each of the corresponding 32-bit float values of the
888 /// 128-bit vectors of [4 x float] to determine if the values in the first
889 /// operand are not greater than or equal to those in the second operand.
890 ///
891 /// \headerfile <x86intrin.h>
892 ///
893 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
894 /// instructions.
895 ///
896 /// \param __a
897 /// A 128-bit vector of [4 x float].
898 /// \param __b
899 /// A 128-bit vector of [4 x float].
900 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
901 static __inline__ __m128 __DEFAULT_FN_ATTRS
902 _mm_cmpnge_ps(__m128 __a, __m128 __b)
903 {
904  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
905 }
906 
907 /// Compares two 32-bit float values in the low-order bits of both
908 /// operands to determine if the value in the first operand is ordered with
909 /// respect to the corresponding value in the second operand and returns the
910 /// result of the comparison in the low-order bits of a vector of
911 /// [4 x float].
912 ///
913 /// \headerfile <x86intrin.h>
914 ///
915 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
916 /// instructions.
917 ///
918 /// \param __a
919 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
920 /// 32 bits of this operand are used in the comparison.
921 /// \param __b
922 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
923 /// 32 bits of this operand are used in the comparison.
924 /// \returns A 128-bit vector of [4 x float] containing the comparison results
925 /// in the low-order bits.
926 static __inline__ __m128 __DEFAULT_FN_ATTRS
927 _mm_cmpord_ss(__m128 __a, __m128 __b)
928 {
929  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
930 }
931 
932 /// Compares each of the corresponding 32-bit float values of the
933 /// 128-bit vectors of [4 x float] to determine if the values in the first
934 /// operand are ordered with respect to those in the second operand.
935 ///
936 /// \headerfile <x86intrin.h>
937 ///
938 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
939 /// instructions.
940 ///
941 /// \param __a
942 /// A 128-bit vector of [4 x float].
943 /// \param __b
944 /// A 128-bit vector of [4 x float].
945 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
946 static __inline__ __m128 __DEFAULT_FN_ATTRS
947 _mm_cmpord_ps(__m128 __a, __m128 __b)
948 {
949  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
950 }
951 
952 /// Compares two 32-bit float values in the low-order bits of both
953 /// operands to determine if the value in the first operand is unordered
954 /// with respect to the corresponding value in the second operand and
955 /// returns the result of the comparison in the low-order bits of a vector
956 /// of [4 x float].
957 ///
958 /// \headerfile <x86intrin.h>
959 ///
960 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
961 /// instructions.
962 ///
963 /// \param __a
964 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
965 /// 32 bits of this operand are used in the comparison.
966 /// \param __b
967 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
968 /// 32 bits of this operand are used in the comparison.
969 /// \returns A 128-bit vector of [4 x float] containing the comparison results
970 /// in the low-order bits.
971 static __inline__ __m128 __DEFAULT_FN_ATTRS
972 _mm_cmpunord_ss(__m128 __a, __m128 __b)
973 {
974  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
975 }
976 
977 /// Compares each of the corresponding 32-bit float values of the
978 /// 128-bit vectors of [4 x float] to determine if the values in the first
979 /// operand are unordered with respect to those in the second operand.
980 ///
981 /// \headerfile <x86intrin.h>
982 ///
983 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
984 /// instructions.
985 ///
986 /// \param __a
987 /// A 128-bit vector of [4 x float].
988 /// \param __b
989 /// A 128-bit vector of [4 x float].
990 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
991 static __inline__ __m128 __DEFAULT_FN_ATTRS
992 _mm_cmpunord_ps(__m128 __a, __m128 __b)
993 {
994  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
995 }
996 
997 /// Compares two 32-bit float values in the low-order bits of both
998 /// operands for equality and returns the result of the comparison.
999 ///
1000 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1001 ///
1002 /// \headerfile <x86intrin.h>
1003 ///
1004 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1005 /// instructions.
1006 ///
1007 /// \param __a
1008 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009 /// used in the comparison.
1010 /// \param __b
1011 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1012 /// used in the comparison.
1013 /// \returns An integer containing the comparison results. If either of the
1014 /// two lower 32-bit values is NaN, 0 is returned.
1015 static __inline__ int __DEFAULT_FN_ATTRS
1016 _mm_comieq_ss(__m128 __a, __m128 __b)
1017 {
1018  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1019 }
1020 
1021 /// Compares two 32-bit float values in the low-order bits of both
1022 /// operands to determine if the first operand is less than the second
1023 /// operand and returns the result of the comparison.
1024 ///
1025 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1026 ///
1027 /// \headerfile <x86intrin.h>
1028 ///
1029 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1030 /// instructions.
1031 ///
1032 /// \param __a
1033 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1034 /// used in the comparison.
1035 /// \param __b
1036 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1037 /// used in the comparison.
1038 /// \returns An integer containing the comparison results. If either of the two
1039 /// lower 32-bit values is NaN, 0 is returned.
1040 static __inline__ int __DEFAULT_FN_ATTRS
1041 _mm_comilt_ss(__m128 __a, __m128 __b)
1042 {
1043  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1044 }
1045 
1046 /// Compares two 32-bit float values in the low-order bits of both
1047 /// operands to determine if the first operand is less than or equal to the
1048 /// second operand and returns the result of the comparison.
1049 ///
1050 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1051 ///
1052 /// \headerfile <x86intrin.h>
1053 ///
1054 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1055 ///
1056 /// \param __a
1057 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1058 /// used in the comparison.
1059 /// \param __b
1060 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1061 /// used in the comparison.
1062 /// \returns An integer containing the comparison results. If either of the two
1063 /// lower 32-bit values is NaN, 0 is returned.
1064 static __inline__ int __DEFAULT_FN_ATTRS
1065 _mm_comile_ss(__m128 __a, __m128 __b)
1066 {
1067  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1068 }
1069 
1070 /// Compares two 32-bit float values in the low-order bits of both
1071 /// operands to determine if the first operand is greater than the second
1072 /// operand and returns the result of the comparison.
1073 ///
1074 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1075 ///
1076 /// \headerfile <x86intrin.h>
1077 ///
1078 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1079 ///
1080 /// \param __a
1081 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1082 /// used in the comparison.
1083 /// \param __b
1084 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085 /// used in the comparison.
1086 /// \returns An integer containing the comparison results. If either of the
1087 /// two lower 32-bit values is NaN, 0 is returned.
1088 static __inline__ int __DEFAULT_FN_ATTRS
1089 _mm_comigt_ss(__m128 __a, __m128 __b)
1090 {
1091  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1092 }
1093 
1094 /// Compares two 32-bit float values in the low-order bits of both
1095 /// operands to determine if the first operand is greater than or equal to
1096 /// the second operand and returns the result of the comparison.
1097 ///
1098 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1099 ///
1100 /// \headerfile <x86intrin.h>
1101 ///
1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1103 ///
1104 /// \param __a
1105 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106 /// used in the comparison.
1107 /// \param __b
1108 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109 /// used in the comparison.
1110 /// \returns An integer containing the comparison results. If either of the two
1111 /// lower 32-bit values is NaN, 0 is returned.
1112 static __inline__ int __DEFAULT_FN_ATTRS
1113 _mm_comige_ss(__m128 __a, __m128 __b)
1114 {
1115  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1116 }
1117 
1118 /// Compares two 32-bit float values in the low-order bits of both
1119 /// operands to determine if the first operand is not equal to the second
1120 /// operand and returns the result of the comparison.
1121 ///
1122 /// If either of the two lower 32-bit values is NaN, 1 is returned.
1123 ///
1124 /// \headerfile <x86intrin.h>
1125 ///
1126 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1127 ///
1128 /// \param __a
1129 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130 /// used in the comparison.
1131 /// \param __b
1132 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1133 /// used in the comparison.
1134 /// \returns An integer containing the comparison results. If either of the
1135 /// two lower 32-bit values is NaN, 1 is returned.
1136 static __inline__ int __DEFAULT_FN_ATTRS
1137 _mm_comineq_ss(__m128 __a, __m128 __b)
1138 {
1139  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1140 }
1141 
1142 /// Performs an unordered comparison of two 32-bit float values using
1143 /// the low-order bits of both operands to determine equality and returns
1144 /// the result of the comparison.
1145 ///
1146 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1147 ///
1148 /// \headerfile <x86intrin.h>
1149 ///
1150 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1151 ///
1152 /// \param __a
1153 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1154 /// used in the comparison.
1155 /// \param __b
1156 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1157 /// used in the comparison.
1158 /// \returns An integer containing the comparison results. If either of the two
1159 /// lower 32-bit values is NaN, 0 is returned.
1160 static __inline__ int __DEFAULT_FN_ATTRS
1161 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1162 {
1163  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1164 }
1165 
1166 /// Performs an unordered comparison of two 32-bit float values using
1167 /// the low-order bits of both operands to determine if the first operand is
1168 /// less than the second operand and returns the result of the comparison.
1169 ///
1170 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1171 ///
1172 /// \headerfile <x86intrin.h>
1173 ///
1174 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1175 ///
1176 /// \param __a
1177 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178 /// used in the comparison.
1179 /// \param __b
1180 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1181 /// used in the comparison.
1182 /// \returns An integer containing the comparison results. If either of the two
1183 /// lower 32-bit values is NaN, 0 is returned.
1184 static __inline__ int __DEFAULT_FN_ATTRS
1185 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1186 {
1187  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1188 }
1189 
1190 /// Performs an unordered comparison of two 32-bit float values using
1191 /// the low-order bits of both operands to determine if the first operand is
1192 /// less than or equal to the second operand and returns the result of the
1193 /// comparison.
1194 ///
1195 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1196 ///
1197 /// \headerfile <x86intrin.h>
1198 ///
1199 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1200 ///
1201 /// \param __a
1202 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1203 /// used in the comparison.
1204 /// \param __b
1205 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206 /// used in the comparison.
1207 /// \returns An integer containing the comparison results. If either of the two
1208 /// lower 32-bit values is NaN, 0 is returned.
1209 static __inline__ int __DEFAULT_FN_ATTRS
1210 _mm_ucomile_ss(__m128 __a, __m128 __b)
1211 {
1212  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1213 }
1214 
1215 /// Performs an unordered comparison of two 32-bit float values using
1216 /// the low-order bits of both operands to determine if the first operand is
1217 /// greater than the second operand and returns the result of the
1218 /// comparison.
1219 ///
1220 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1221 ///
1222 /// \headerfile <x86intrin.h>
1223 ///
1224 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1225 ///
1226 /// \param __a
1227 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1228 /// used in the comparison.
1229 /// \param __b
1230 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231 /// used in the comparison.
1232 /// \returns An integer containing the comparison results. If either of the two
1233 /// lower 32-bit values is NaN, 0 is returned.
1234 static __inline__ int __DEFAULT_FN_ATTRS
1235 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1236 {
1237  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1238 }
1239 
1240 /// Performs an unordered comparison of two 32-bit float values using
1241 /// the low-order bits of both operands to determine if the first operand is
1242 /// greater than or equal to the second operand and returns the result of
1243 /// the comparison.
1244 ///
1245 /// If either of the two lower 32-bit values is NaN, 0 is returned.
1246 ///
1247 /// \headerfile <x86intrin.h>
1248 ///
1249 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1250 ///
1251 /// \param __a
1252 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1253 /// used in the comparison.
1254 /// \param __b
1255 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256 /// used in the comparison.
1257 /// \returns An integer containing the comparison results. If either of the two
1258 /// lower 32-bit values is NaN, 0 is returned.
1259 static __inline__ int __DEFAULT_FN_ATTRS
1260 _mm_ucomige_ss(__m128 __a, __m128 __b)
1261 {
1262  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1263 }
1264 
1265 /// Performs an unordered comparison of two 32-bit float values using
1266 /// the low-order bits of both operands to determine inequality and returns
1267 /// the result of the comparison.
1268 ///
1269 /// If either of the two lower 32-bit values is NaN, 1 is returned.
1270 ///
1271 /// \headerfile <x86intrin.h>
1272 ///
1273 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1274 ///
1275 /// \param __a
1276 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 /// used in the comparison.
1278 /// \param __b
1279 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1280 /// used in the comparison.
1281 /// \returns An integer containing the comparison results. If either of the two
1282 /// lower 32-bit values is NaN, 1 is returned.
1283 static __inline__ int __DEFAULT_FN_ATTRS
1284 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1285 {
1286  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1287 }
1288 
1289 /// Converts a float value contained in the lower 32 bits of a vector of
1290 /// [4 x float] into a 32-bit integer.
1291 ///
1292 /// \headerfile <x86intrin.h>
1293 ///
1294 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1295 /// instructions.
1296 ///
1297 /// \param __a
1298 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1299 /// used in the conversion.
1300 /// \returns A 32-bit integer containing the converted value.
1301 static __inline__ int __DEFAULT_FN_ATTRS
1303 {
1304  return __builtin_ia32_cvtss2si((__v4sf)__a);
1305 }
1306 
1307 /// Converts a float value contained in the lower 32 bits of a vector of
1308 /// [4 x float] into a 32-bit integer.
1309 ///
1310 /// \headerfile <x86intrin.h>
1311 ///
1312 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1313 /// instructions.
1314 ///
1315 /// \param __a
1316 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1317 /// used in the conversion.
1318 /// \returns A 32-bit integer containing the converted value.
1319 static __inline__ int __DEFAULT_FN_ATTRS
1321 {
1322  return _mm_cvtss_si32(__a);
1323 }
1324 
1325 #ifdef __x86_64__
1326 
1327 /// Converts a float value contained in the lower 32 bits of a vector of
1328 /// [4 x float] into a 64-bit integer.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1333 /// instructions.
1334 ///
1335 /// \param __a
1336 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1337 /// used in the conversion.
1338 /// \returns A 64-bit integer containing the converted value.
1339 static __inline__ long long __DEFAULT_FN_ATTRS
1340 _mm_cvtss_si64(__m128 __a)
1341 {
1342  return __builtin_ia32_cvtss2si64((__v4sf)__a);
1343 }
1344 
1345 #endif
1346 
1347 /// Converts two low-order float values in a 128-bit vector of
1348 /// [4 x float] into a 64-bit vector of [2 x i32].
1349 ///
1350 /// \headerfile <x86intrin.h>
1351 ///
1352 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1353 ///
1354 /// \param __a
1355 /// A 128-bit vector of [4 x float].
1356 /// \returns A 64-bit integer vector containing the converted values.
1357 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1358 _mm_cvtps_pi32(__m128 __a)
1359 {
1360  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1361 }
1362 
1363 /// Converts two low-order float values in a 128-bit vector of
1364 /// [4 x float] into a 64-bit vector of [2 x i32].
1365 ///
1366 /// \headerfile <x86intrin.h>
1367 ///
1368 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1369 ///
1370 /// \param __a
1371 /// A 128-bit vector of [4 x float].
1372 /// \returns A 64-bit integer vector containing the converted values.
1373 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1374 _mm_cvt_ps2pi(__m128 __a)
1375 {
1376  return _mm_cvtps_pi32(__a);
1377 }
1378 
1379 /// Converts a float value contained in the lower 32 bits of a vector of
1380 /// [4 x float] into a 32-bit integer, truncating the result when it is
1381 /// inexact.
1382 ///
1383 /// \headerfile <x86intrin.h>
1384 ///
1385 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1386 /// instructions.
1387 ///
1388 /// \param __a
1389 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1390 /// used in the conversion.
1391 /// \returns A 32-bit integer containing the converted value.
1392 static __inline__ int __DEFAULT_FN_ATTRS
1393 _mm_cvttss_si32(__m128 __a)
1394 {
1395  return __builtin_ia32_cvttss2si((__v4sf)__a);
1396 }
1397 
1398 /// Converts a float value contained in the lower 32 bits of a vector of
1399 /// [4 x float] into a 32-bit integer, truncating the result when it is
1400 /// inexact.
1401 ///
1402 /// \headerfile <x86intrin.h>
1403 ///
1404 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1405 /// instructions.
1406 ///
1407 /// \param __a
1408 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1409 /// used in the conversion.
1410 /// \returns A 32-bit integer containing the converted value.
1411 static __inline__ int __DEFAULT_FN_ATTRS
1412 _mm_cvtt_ss2si(__m128 __a)
1413 {
1414  return _mm_cvttss_si32(__a);
1415 }
1416 
1417 #ifdef __x86_64__
1418 /// Converts a float value contained in the lower 32 bits of a vector of
1419 /// [4 x float] into a 64-bit integer, truncating the result when it is
1420 /// inexact.
1421 ///
1422 /// \headerfile <x86intrin.h>
1423 ///
1424 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1425 /// instructions.
1426 ///
1427 /// \param __a
1428 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1429 /// used in the conversion.
1430 /// \returns A 64-bit integer containing the converted value.
1431 static __inline__ long long __DEFAULT_FN_ATTRS
1432 _mm_cvttss_si64(__m128 __a)
1433 {
1434  return __builtin_ia32_cvttss2si64((__v4sf)__a);
1435 }
1436 #endif
1437 
1438 /// Converts two low-order float values in a 128-bit vector of
1439 /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1440 /// when it is inexact.
1441 ///
1442 /// \headerfile <x86intrin.h>
1443 ///
1444 /// This intrinsic corresponds to the <c> CVTTPS2PI </c>
1445 /// instruction.
1446 ///
1447 /// \param __a
1448 /// A 128-bit vector of [4 x float].
1449 /// \returns A 64-bit integer vector containing the converted values.
1450 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1451 _mm_cvttps_pi32(__m128 __a)
1452 {
1453  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1454 }
1455 
1456 /// Converts two low-order float values in a 128-bit vector of [4 x
1457 /// float] into a 64-bit vector of [2 x i32], truncating the result when it
1458 /// is inexact.
1459 ///
1460 /// \headerfile <x86intrin.h>
1461 ///
1462 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1463 ///
1464 /// \param __a
1465 /// A 128-bit vector of [4 x float].
1466 /// \returns A 64-bit integer vector containing the converted values.
1467 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1468 _mm_cvtt_ps2pi(__m128 __a)
1469 {
1470  return _mm_cvttps_pi32(__a);
1471 }
1472 
1473 /// Converts a 32-bit signed integer value into a floating point value
1474 /// and writes it to the lower 32 bits of the destination. The remaining
1475 /// higher order elements of the destination vector are copied from the
1476 /// corresponding elements in the first operand.
1477 ///
1478 /// \headerfile <x86intrin.h>
1479 ///
1480 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1481 ///
1482 /// \param __a
1483 /// A 128-bit vector of [4 x float].
1484 /// \param __b
1485 /// A 32-bit signed integer operand containing the value to be converted.
1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487 /// converted value of the second operand. The upper 96 bits are copied from
1488 /// the upper 96 bits of the first operand.
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
1490 _mm_cvtsi32_ss(__m128 __a, int __b)
1491 {
1492  __a[0] = __b;
1493  return __a;
1494 }
1495 
1496 /// Converts a 32-bit signed integer value into a floating point value
1497 /// and writes it to the lower 32 bits of the destination. The remaining
1498 /// higher order elements of the destination are copied from the
1499 /// corresponding elements in the first operand.
1500 ///
1501 /// \headerfile <x86intrin.h>
1502 ///
1503 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1504 ///
1505 /// \param __a
1506 /// A 128-bit vector of [4 x float].
1507 /// \param __b
1508 /// A 32-bit signed integer operand containing the value to be converted.
1509 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1510 /// converted value of the second operand. The upper 96 bits are copied from
1511 /// the upper 96 bits of the first operand.
1512 static __inline__ __m128 __DEFAULT_FN_ATTRS
1513 _mm_cvt_si2ss(__m128 __a, int __b)
1514 {
1515  return _mm_cvtsi32_ss(__a, __b);
1516 }
1517 
1518 #ifdef __x86_64__
1519 
1520 /// Converts a 64-bit signed integer value into a floating point value
1521 /// and writes it to the lower 32 bits of the destination. The remaining
1522 /// higher order elements of the destination are copied from the
1523 /// corresponding elements in the first operand.
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
1527 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1528 ///
1529 /// \param __a
1530 /// A 128-bit vector of [4 x float].
1531 /// \param __b
1532 /// A 64-bit signed integer operand containing the value to be converted.
1533 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1534 /// converted value of the second operand. The upper 96 bits are copied from
1535 /// the upper 96 bits of the first operand.
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS
1537 _mm_cvtsi64_ss(__m128 __a, long long __b)
1538 {
1539  __a[0] = __b;
1540  return __a;
1541 }
1542 
1543 #endif
1544 
1545 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1546 /// floating point values and writes them to the lower 64-bits of the
1547 /// destination. The remaining higher order elements of the destination are
1548 /// copied from the corresponding elements in the first operand.
1549 ///
1550 /// \headerfile <x86intrin.h>
1551 ///
1552 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1553 ///
1554 /// \param __a
1555 /// A 128-bit vector of [4 x float].
1556 /// \param __b
1557 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1558 /// and written to the corresponding low-order elements in the destination.
1559 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1560 /// converted value of the second operand. The upper 64 bits are copied from
1561 /// the upper 64 bits of the first operand.
1562 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1563 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1564 {
1565  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1566 }
1567 
1568 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1569 /// floating point values and writes them to the lower 64-bits of the
1570 /// destination. The remaining higher order elements of the destination are
1571 /// copied from the corresponding elements in the first operand.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1576 ///
1577 /// \param __a
1578 /// A 128-bit vector of [4 x float].
1579 /// \param __b
1580 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1581 /// and written to the corresponding low-order elements in the destination.
1582 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1583 /// converted value from the second operand. The upper 64 bits are copied
1584 /// from the upper 64 bits of the first operand.
1585 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1586 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1587 {
1588  return _mm_cvtpi32_ps(__a, __b);
1589 }
1590 
1591 /// Extracts a float value contained in the lower 32 bits of a vector of
1592 /// [4 x float].
1593 ///
1594 /// \headerfile <x86intrin.h>
1595 ///
1596 /// This intrinsic has no corresponding instruction.
1597 ///
1598 /// \param __a
1599 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1600 /// used in the extraction.
1601 /// \returns A 32-bit float containing the extracted value.
1602 static __inline__ float __DEFAULT_FN_ATTRS
1603 _mm_cvtss_f32(__m128 __a)
1604 {
1605  return __a[0];
1606 }
1607 
1608 /// Loads two packed float values from the address \a __p into the
1609 /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1610 /// are copied from the low-order bits of the first operand.
1611 ///
1612 /// \headerfile <x86intrin.h>
1613 ///
1614 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1615 ///
1616 /// \param __a
1617 /// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1618 /// of the destination.
1619 /// \param __p
1620 /// A pointer to two packed float values. Bits [63:0] are written to bits
1621 /// [127:64] of the destination.
1622 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1623 static __inline__ __m128 __DEFAULT_FN_ATTRS
1624 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1625 {
1626  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1627  struct __mm_loadh_pi_struct {
1628  __mm_loadh_pi_v2f32 __u;
1629  } __attribute__((__packed__, __may_alias__));
1630  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1631  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1632  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1633 }
1634 
1635 /// Loads two packed float values from the address \a __p into the
1636 /// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1637 /// are copied from the high-order bits of the first operand.
1638 ///
1639 /// \headerfile <x86intrin.h>
1640 ///
1641 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1642 ///
1643 /// \param __a
1644 /// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1645 /// [127:64] of the destination.
1646 /// \param __p
1647 /// A pointer to two packed float values. Bits [63:0] are written to bits
1648 /// [63:0] of the destination.
1649 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1650 static __inline__ __m128 __DEFAULT_FN_ATTRS
1651 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1652 {
1653  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1654  struct __mm_loadl_pi_struct {
1655  __mm_loadl_pi_v2f32 __u;
1656  } __attribute__((__packed__, __may_alias__));
1657  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1658  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1659  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1660 }
1661 
1662 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1663 /// 32 bits of the vector are initialized with the single-precision
1664 /// floating-point value loaded from a specified memory location. The upper
1665 /// 96 bits are set to zero.
1666 ///
1667 /// \headerfile <x86intrin.h>
1668 ///
1669 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1670 ///
1671 /// \param __p
1672 /// A pointer to a 32-bit memory location containing a single-precision
1673 /// floating-point value.
1674 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1675 /// lower 32 bits contain the value loaded from the memory location. The
1676 /// upper 96 bits are set to zero.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS
1678 _mm_load_ss(const float *__p)
1679 {
1680  struct __mm_load_ss_struct {
1681  float __u;
1682  } __attribute__((__packed__, __may_alias__));
1683  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1684  return __extension__ (__m128){ __u, 0, 0, 0 };
1685 }
1686 
1687 /// Loads a 32-bit float value and duplicates it to all four vector
1688 /// elements of a 128-bit vector of [4 x float].
1689 ///
1690 /// \headerfile <x86intrin.h>
1691 ///
1692 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1693 /// instruction.
1694 ///
1695 /// \param __p
1696 /// A pointer to a float value to be loaded and duplicated.
1697 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1698 /// duplicated values.
1699 static __inline__ __m128 __DEFAULT_FN_ATTRS
1700 _mm_load1_ps(const float *__p)
1701 {
1702  struct __mm_load1_ps_struct {
1703  float __u;
1704  } __attribute__((__packed__, __may_alias__));
1705  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1706  return __extension__ (__m128){ __u, __u, __u, __u };
1707 }
1708 
1709 #define _mm_load_ps1(p) _mm_load1_ps(p)
1710 
1711 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1712 /// memory location.
1713 ///
1714 /// \headerfile <x86intrin.h>
1715 ///
1716 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1717 ///
1718 /// \param __p
1719 /// A pointer to a 128-bit memory location. The address of the memory
1720 /// location has to be 128-bit aligned.
1721 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1722 static __inline__ __m128 __DEFAULT_FN_ATTRS
1723 _mm_load_ps(const float *__p)
1724 {
1725  return *(__m128*)__p;
1726 }
1727 
1728 /// Loads a 128-bit floating-point vector of [4 x float] from an
1729 /// unaligned memory location.
1730 ///
1731 /// \headerfile <x86intrin.h>
1732 ///
1733 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1734 ///
1735 /// \param __p
1736 /// A pointer to a 128-bit memory location. The address of the memory
1737 /// location does not have to be aligned.
1738 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1739 static __inline__ __m128 __DEFAULT_FN_ATTRS
1740 _mm_loadu_ps(const float *__p)
1741 {
1742  struct __loadu_ps {
1743  __m128_u __v;
1744  } __attribute__((__packed__, __may_alias__));
1745  return ((struct __loadu_ps*)__p)->__v;
1746 }
1747 
1748 /// Loads four packed float values, in reverse order, from an aligned
1749 /// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1754 /// instruction.
1755 ///
1756 /// \param __p
1757 /// A pointer to a 128-bit memory location. The address of the memory
1758 /// location has to be 128-bit aligned.
1759 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1760 /// in reverse order.
1761 static __inline__ __m128 __DEFAULT_FN_ATTRS
1762 _mm_loadr_ps(const float *__p)
1763 {
1764  __m128 __a = _mm_load_ps(__p);
1765  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1766 }
1767 
1768 /// Create a 128-bit vector of [4 x float] with undefined values.
1769 ///
1770 /// \headerfile <x86intrin.h>
1771 ///
1772 /// This intrinsic has no corresponding instruction.
1773 ///
1774 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1775 static __inline__ __m128 __DEFAULT_FN_ATTRS
1777 {
1778  return (__m128)__builtin_ia32_undef128();
1779 }
1780 
1781 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1782 /// 32 bits of the vector are initialized with the specified single-precision
1783 /// floating-point value. The upper 96 bits are set to zero.
1784 ///
1785 /// \headerfile <x86intrin.h>
1786 ///
1787 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1788 ///
1789 /// \param __w
1790 /// A single-precision floating-point value used to initialize the lower 32
1791 /// bits of the result.
1792 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1793 /// lower 32 bits contain the value provided in the source operand. The
1794 /// upper 96 bits are set to zero.
1795 static __inline__ __m128 __DEFAULT_FN_ATTRS
1796 _mm_set_ss(float __w)
1797 {
1798  return __extension__ (__m128){ __w, 0, 0, 0 };
1799 }
1800 
1801 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1802 /// of the four single-precision floating-point vector elements set to the
1803 /// specified single-precision floating-point value.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1808 ///
1809 /// \param __w
1810 /// A single-precision floating-point value used to initialize each vector
1811 /// element of the result.
1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1813 static __inline__ __m128 __DEFAULT_FN_ATTRS
1814 _mm_set1_ps(float __w)
1815 {
1816  return __extension__ (__m128){ __w, __w, __w, __w };
1817 }
1818 
1819 /* Microsoft specific. */
1820 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1821 /// of the four single-precision floating-point vector elements set to the
1822 /// specified single-precision floating-point value.
1823 ///
1824 /// \headerfile <x86intrin.h>
1825 ///
1826 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1827 ///
1828 /// \param __w
1829 /// A single-precision floating-point value used to initialize each vector
1830 /// element of the result.
1831 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1832 static __inline__ __m128 __DEFAULT_FN_ATTRS
1833 _mm_set_ps1(float __w)
1834 {
1835  return _mm_set1_ps(__w);
1836 }
1837 
1838 /// Constructs a 128-bit floating-point vector of [4 x float]
1839 /// initialized with the specified single-precision floating-point values.
1840 ///
1841 /// \headerfile <x86intrin.h>
1842 ///
1843 /// This intrinsic is a utility function and does not correspond to a specific
1844 /// instruction.
1845 ///
1846 /// \param __z
1847 /// A single-precision floating-point value used to initialize bits [127:96]
1848 /// of the result.
1849 /// \param __y
1850 /// A single-precision floating-point value used to initialize bits [95:64]
1851 /// of the result.
1852 /// \param __x
1853 /// A single-precision floating-point value used to initialize bits [63:32]
1854 /// of the result.
1855 /// \param __w
1856 /// A single-precision floating-point value used to initialize bits [31:0]
1857 /// of the result.
1858 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1859 static __inline__ __m128 __DEFAULT_FN_ATTRS
1860 _mm_set_ps(float __z, float __y, float __x, float __w)
1861 {
1862  return __extension__ (__m128){ __w, __x, __y, __z };
1863 }
1864 
1865 /// Constructs a 128-bit floating-point vector of [4 x float],
1866 /// initialized in reverse order with the specified 32-bit single-precision
1867 /// float-point values.
1868 ///
1869 /// \headerfile <x86intrin.h>
1870 ///
1871 /// This intrinsic is a utility function and does not correspond to a specific
1872 /// instruction.
1873 ///
1874 /// \param __z
1875 /// A single-precision floating-point value used to initialize bits [31:0]
1876 /// of the result.
1877 /// \param __y
1878 /// A single-precision floating-point value used to initialize bits [63:32]
1879 /// of the result.
1880 /// \param __x
1881 /// A single-precision floating-point value used to initialize bits [95:64]
1882 /// of the result.
1883 /// \param __w
1884 /// A single-precision floating-point value used to initialize bits [127:96]
1885 /// of the result.
1886 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1887 static __inline__ __m128 __DEFAULT_FN_ATTRS
1888 _mm_setr_ps(float __z, float __y, float __x, float __w)
1889 {
1890  return __extension__ (__m128){ __z, __y, __x, __w };
1891 }
1892 
1893 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
1894 /// to zero.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1899 ///
1900 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1901 /// all elements set to zero.
1902 static __inline__ __m128 __DEFAULT_FN_ATTRS
1904 {
1905  return __extension__ (__m128){ 0, 0, 0, 0 };
1906 }
1907 
1908 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1909 /// memory location.
1910 ///
1911 /// \headerfile <x86intrin.h>
1912 ///
1913 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1914 ///
1915 /// \param __p
1916 /// A pointer to a 64-bit memory location.
1917 /// \param __a
1918 /// A 128-bit vector of [4 x float] containing the values to be stored.
1919 static __inline__ void __DEFAULT_FN_ATTRS
1920 _mm_storeh_pi(__m64 *__p, __m128 __a)
1921 {
1922  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
1923 }
1924 
1925 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1926 /// memory location.
1927 ///
1928 /// \headerfile <x86intrin.h>
1929 ///
1930 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1931 ///
1932 /// \param __p
1933 /// A pointer to a memory location that will receive the float values.
1934 /// \param __a
1935 /// A 128-bit vector of [4 x float] containing the values to be stored.
1936 static __inline__ void __DEFAULT_FN_ATTRS
1937 _mm_storel_pi(__m64 *__p, __m128 __a)
1938 {
1939  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
1940 }
1941 
1942 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1943 /// memory location.
1944 ///
1945 /// \headerfile <x86intrin.h>
1946 ///
1947 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1948 ///
1949 /// \param __p
1950 /// A pointer to a 32-bit memory location.
1951 /// \param __a
1952 /// A 128-bit vector of [4 x float] containing the value to be stored.
1953 static __inline__ void __DEFAULT_FN_ATTRS
1954 _mm_store_ss(float *__p, __m128 __a)
1955 {
1956  struct __mm_store_ss_struct {
1957  float __u;
1958  } __attribute__((__packed__, __may_alias__));
1959  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1960 }
1961 
1962 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
1963 /// location.
1964 ///
1965 /// \headerfile <x86intrin.h>
1966 ///
1967 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1968 ///
1969 /// \param __p
1970 /// A pointer to a 128-bit memory location. The address of the memory
1971 /// location does not have to be aligned.
1972 /// \param __a
1973 /// A 128-bit vector of [4 x float] containing the values to be stored.
1974 static __inline__ void __DEFAULT_FN_ATTRS
1975 _mm_storeu_ps(float *__p, __m128 __a)
1976 {
1977  struct __storeu_ps {
1978  __m128_u __v;
1979  } __attribute__((__packed__, __may_alias__));
1980  ((struct __storeu_ps*)__p)->__v = __a;
1981 }
1982 
1983 /// Stores a 128-bit vector of [4 x float] into an aligned memory
1984 /// location.
1985 ///
1986 /// \headerfile <x86intrin.h>
1987 ///
1988 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1989 ///
1990 /// \param __p
1991 /// A pointer to a 128-bit memory location. The address of the memory
1992 /// location has to be 16-byte aligned.
1993 /// \param __a
1994 /// A 128-bit vector of [4 x float] containing the values to be stored.
1995 static __inline__ void __DEFAULT_FN_ATTRS
1996 _mm_store_ps(float *__p, __m128 __a)
1997 {
1998  *(__m128*)__p = __a;
1999 }
2000 
2001 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2002 /// four contiguous elements in an aligned memory location.
2003 ///
2004 /// \headerfile <x86intrin.h>
2005 ///
2006 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2007 /// instruction.
2008 ///
2009 /// \param __p
2010 /// A pointer to a 128-bit memory location.
2011 /// \param __a
2012 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2013 /// of the four contiguous elements pointed by \a __p.
2014 static __inline__ void __DEFAULT_FN_ATTRS
2015 _mm_store1_ps(float *__p, __m128 __a)
2016 {
2017  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2018  _mm_store_ps(__p, __a);
2019 }
2020 
2021 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2022 /// four contiguous elements in an aligned memory location.
2023 ///
2024 /// \headerfile <x86intrin.h>
2025 ///
2026 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2027 /// instruction.
2028 ///
2029 /// \param __p
2030 /// A pointer to a 128-bit memory location.
2031 /// \param __a
2032 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2033 /// of the four contiguous elements pointed by \a __p.
2034 static __inline__ void __DEFAULT_FN_ATTRS
2035 _mm_store_ps1(float *__p, __m128 __a)
2036 {
2037  _mm_store1_ps(__p, __a);
2038 }
2039 
2040 /// Stores float values from a 128-bit vector of [4 x float] to an
2041 /// aligned memory location in reverse order.
2042 ///
2043 /// \headerfile <x86intrin.h>
2044 ///
2045 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2046 /// instruction.
2047 ///
2048 /// \param __p
2049 /// A pointer to a 128-bit memory location. The address of the memory
2050 /// location has to be 128-bit aligned.
2051 /// \param __a
2052 /// A 128-bit vector of [4 x float] containing the values to be stored.
2053 static __inline__ void __DEFAULT_FN_ATTRS
2054 _mm_storer_ps(float *__p, __m128 __a)
2055 {
2056  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2057  _mm_store_ps(__p, __a);
2058 }
2059 
/* Prefetch hint selectors for _mm_prefetch. The macro below decodes them as:
   bit 2      -> second ("rw") argument of __builtin_prefetch,
   bits [1:0] -> third ("locality") argument of __builtin_prefetch. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0

#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    pointer is passed on as <c>const void *</c>, so pointers to const data
///    can be used without discarding the qualifier.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2098 
2099 /// Stores a 64-bit integer in the specified aligned memory location. To
2100 /// minimize caching, the data is flagged as non-temporal (unlikely to be
2101 /// used again soon).
2102 ///
2103 /// \headerfile <x86intrin.h>
2104 ///
2105 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2106 ///
2107 /// \param __p
2108 /// A pointer to an aligned memory location used to store the register value.
2109 /// \param __a
2110 /// A 64-bit integer containing the value to be stored.
2111 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2112 _mm_stream_pi(__m64 *__p, __m64 __a)
2113 {
2114  __builtin_ia32_movntq(__p, __a);
2115 }
2116 
2117 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2118 /// 128-bit aligned memory location. To minimize caching, the data is flagged
2119 /// as non-temporal (unlikely to be used again soon).
2120 ///
2121 /// \headerfile <x86intrin.h>
2122 ///
2123 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2124 ///
2125 /// \param __p
2126 /// A pointer to a 128-bit aligned memory location that will receive the
2127 /// single-precision floating-point values.
2128 /// \param __a
2129 /// A 128-bit vector of [4 x float] containing the values to be moved.
2130 static __inline__ void __DEFAULT_FN_ATTRS
2131 _mm_stream_ps(float *__p, __m128 __a)
2132 {
2133  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2134 }
2135 
#ifdef __cplusplus
extern "C" {
#endif

/// Enforces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and those that follow it,
///    ensuring the system completes all previous stores before executing
///    subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2154 
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Arguments and the whole expansion are parenthesized so that compound
   argument expressions and surrounding operators parse as intended. The
   vector operand is cast explicitly to the [4 x i16] type. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))
2177 
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Arguments and the whole expansion are parenthesized so that compound
   argument expressions and surrounding operators parse as intended. The
   vector operand is cast explicitly to the [4 x i16] type. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
2208 
2209 /// Compares each of the corresponding packed 16-bit integer values of
2210 /// the 64-bit integer vectors, and writes the greater value to the
2211 /// corresponding bits in the destination.
2212 ///
2213 /// \headerfile <x86intrin.h>
2214 ///
2215 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2216 ///
2217 /// \param __a
2218 /// A 64-bit integer vector containing one of the source operands.
2219 /// \param __b
2220 /// A 64-bit integer vector containing one of the source operands.
2221 /// \returns A 64-bit integer vector containing the comparison results.
2222 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2223 _mm_max_pi16(__m64 __a, __m64 __b)
2224 {
2225  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2226 }
2227 
2228 /// Compares each of the corresponding packed 8-bit unsigned integer
2229 /// values of the 64-bit integer vectors, and writes the greater value to the
2230 /// corresponding bits in the destination.
2231 ///
2232 /// \headerfile <x86intrin.h>
2233 ///
2234 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2235 ///
2236 /// \param __a
2237 /// A 64-bit integer vector containing one of the source operands.
2238 /// \param __b
2239 /// A 64-bit integer vector containing one of the source operands.
2240 /// \returns A 64-bit integer vector containing the comparison results.
2241 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2242 _mm_max_pu8(__m64 __a, __m64 __b)
2243 {
2244  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2245 }
2246 
2247 /// Compares each of the corresponding packed 16-bit integer values of
2248 /// the 64-bit integer vectors, and writes the lesser value to the
2249 /// corresponding bits in the destination.
2250 ///
2251 /// \headerfile <x86intrin.h>
2252 ///
2253 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2254 ///
2255 /// \param __a
2256 /// A 64-bit integer vector containing one of the source operands.
2257 /// \param __b
2258 /// A 64-bit integer vector containing one of the source operands.
2259 /// \returns A 64-bit integer vector containing the comparison results.
2260 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2261 _mm_min_pi16(__m64 __a, __m64 __b)
2262 {
2263  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2264 }
2265 
2266 /// Compares each of the corresponding packed 8-bit unsigned integer
2267 /// values of the 64-bit integer vectors, and writes the lesser value to the
2268 /// corresponding bits in the destination.
2269 ///
2270 /// \headerfile <x86intrin.h>
2271 ///
2272 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2273 ///
2274 /// \param __a
2275 /// A 64-bit integer vector containing one of the source operands.
2276 /// \param __b
2277 /// A 64-bit integer vector containing one of the source operands.
2278 /// \returns A 64-bit integer vector containing the comparison results.
2279 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2280 _mm_min_pu8(__m64 __a, __m64 __b)
2281 {
2282  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2283 }
2284 
2285 /// Takes the most significant bit from each 8-bit element in a 64-bit
2286 /// integer vector to create an 8-bit mask value. Zero-extends the value to
2287 /// 32-bit integer and writes it to the destination.
2288 ///
2289 /// \headerfile <x86intrin.h>
2290 ///
2291 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2292 ///
2293 /// \param __a
2294 /// A 64-bit integer vector containing the values with bits to be extracted.
2295 /// \returns The most significant bit from each 8-bit element in \a __a,
2296 /// written to bits [7:0].
2297 static __inline__ int __DEFAULT_FN_ATTRS_MMX
2299 {
2300  return __builtin_ia32_pmovmskb((__v8qi)__a);
2301 }
2302 
2303 /// Multiplies packed 16-bit unsigned integer values and writes the
2304 /// high-order 16 bits of each 32-bit product to the corresponding bits in
2305 /// the destination.
2306 ///
2307 /// \headerfile <x86intrin.h>
2308 ///
2309 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2310 ///
2311 /// \param __a
2312 /// A 64-bit integer vector containing one of the source operands.
2313 /// \param __b
2314 /// A 64-bit integer vector containing one of the source operands.
2315 /// \returns A 64-bit integer vector containing the products of both operands.
2316 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2317 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2318 {
2319  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2320 }
2321 
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that operators applied to the
   macro result act on the cast value rather than on the builtin call. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2355 
2356 /// Conditionally copies the values from each 8-bit element in the first
2357 /// 64-bit integer vector operand to the specified memory location, as
2358 /// specified by the most significant bit in the corresponding element in the
2359 /// second 64-bit integer vector operand.
2360 ///
2361 /// To minimize caching, the data is flagged as non-temporal
2362 /// (unlikely to be used again soon).
2363 ///
2364 /// \headerfile <x86intrin.h>
2365 ///
2366 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2367 ///
2368 /// \param __d
2369 /// A 64-bit integer vector containing the values with elements to be copied.
2370 /// \param __n
2371 /// A 64-bit integer vector operand. The most significant bit from each 8-bit
2372 /// element determines whether the corresponding element in operand \a __d
2373 /// is copied. If the most significant bit of a given element is 1, the
2374 /// corresponding element in operand \a __d is copied.
2375 /// \param __p
2376 /// A pointer to a 64-bit memory location that will receive the conditionally
2377 /// copied integer values. The address of the memory location does not have
2378 /// to be aligned.
2379 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2380 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2381 {
2382  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2383 }
2384 
2385 /// Computes the rounded averages of the packed unsigned 8-bit integer
2386 /// values and writes the averages to the corresponding bits in the
2387 /// destination.
2388 ///
2389 /// \headerfile <x86intrin.h>
2390 ///
2391 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2392 ///
2393 /// \param __a
2394 /// A 64-bit integer vector containing one of the source operands.
2395 /// \param __b
2396 /// A 64-bit integer vector containing one of the source operands.
2397 /// \returns A 64-bit integer vector containing the averages of both operands.
2398 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2399 _mm_avg_pu8(__m64 __a, __m64 __b)
2400 {
2401  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2402 }
2403 
2404 /// Computes the rounded averages of the packed unsigned 16-bit integer
2405 /// values and writes the averages to the corresponding bits in the
2406 /// destination.
2407 ///
2408 /// \headerfile <x86intrin.h>
2409 ///
2410 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2411 ///
2412 /// \param __a
2413 /// A 64-bit integer vector containing one of the source operands.
2414 /// \param __b
2415 /// A 64-bit integer vector containing one of the source operands.
2416 /// \returns A 64-bit integer vector containing the averages of both operands.
2417 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2418 _mm_avg_pu16(__m64 __a, __m64 __b)
2419 {
2420  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2421 }
2422 
2423 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2424 /// 64-bit vector operands and computes the absolute value for each of the
2425 /// difference. Then sum of the 8 absolute differences is written to the
2426 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2427 ///
2428 /// \headerfile <x86intrin.h>
2429 ///
2430 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2431 ///
2432 /// \param __a
2433 /// A 64-bit integer vector containing one of the source operands.
2434 /// \param __b
2435 /// A 64-bit integer vector containing one of the source operands.
2436 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2437 /// sets of absolute differences between both operands. The upper bits are
2438 /// cleared.
2439 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2440 _mm_sad_pu8(__m64 __a, __m64 __b)
2441 {
2442  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2443 }
2444 
2445 #if defined(__cplusplus)
2446 extern "C" {
2447 #endif
2448 
2449 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2450 /// integer value.
2451 ///
2452 /// There are several groups of macros associated with this
2453 /// intrinsic, including:
2454 /// <ul>
2455 /// <li>
2456 /// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2457 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2458 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2459 /// _MM_GET_EXCEPTION_STATE().
2460 /// </li>
2461 /// <li>
2462 /// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2463 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2464 /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2465 /// </li>
2466 /// <li>
2467 /// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2468 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2469 /// _MM_GET_ROUNDING_MODE().
2470 /// </li>
2471 /// <li>
2472 /// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2473 /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2474 /// </li>
2475 /// <li>
2476 /// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2477 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2478 /// _MM_GET_DENORMALS_ZERO_MODE().
2479 /// </li>
2480 /// </ul>
2481 ///
/// NOTE(review): unlike the other groups, the _MM_DENORMALS_ZERO_* macros and
/// _MM_GET_DENORMALS_ZERO_MODE() are not defined next to the MXCSR macros in
/// this header; presumably they are provided by a later header such as
/// <pmmintrin.h> -- confirm before relying on them with only this header.
///
2482 /// For example, the following expression checks if an overflow exception has
2483 /// occurred:
2484 /// \code
2485 /// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2486 /// \endcode
2487 ///
2488 /// The following expression gets the current rounding mode:
2489 /// \code
2490 /// _MM_GET_ROUNDING_MODE()
2491 /// \endcode
2492 ///
2493 /// \headerfile <x86intrin.h>
2494 ///
2495 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2496 ///
2497 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2498 /// register.
2499 unsigned int _mm_getcsr(void);
2500 
2501 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2502 ///
2503 /// There are several groups of macros associated with this intrinsic,
2504 /// including:
2505 /// <ul>
2506 /// <li>
2507 /// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2508 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2509 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2510 /// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2511 /// </li>
2512 /// <li>
2513 /// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2514 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2515 /// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2516 /// of these macros.
2517 /// </li>
2518 /// <li>
2519 /// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2520 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2521 /// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2522 /// </li>
2523 /// <li>
2524 /// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2525 /// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2526 /// one of these macros.
2527 /// </li>
2528 /// <li>
2529 /// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2530 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2531 /// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2532 /// </li>
2533 /// </ul>
2534 ///
2535 /// For example, the following expression causes subsequent floating-point
2536 /// operations to round up:
/// \code
2537 /// _MM_SET_ROUNDING_MODE(_MM_ROUND_UP)
/// \endcode
/// The wrapper clears the rounding field before setting it. Merely OR-ing
/// _MM_ROUND_UP into the value returned by _mm_getcsr() is not sufficient:
/// if the field already holds _MM_ROUND_DOWN, the combined bits equal
/// _MM_ROUND_TOWARD_ZERO.
2538 ///
2539 /// The following example sets the DAZ and FTZ flags:
2540 /// \code
2541 /// void setFlags() {
2542 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2543 /// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2544 /// }
2545 /// \endcode
2546 ///
2547 /// \headerfile <x86intrin.h>
2548 ///
2549 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2550 ///
2551 /// \param __i
2552 /// A 32-bit unsigned integer value to be written to the MXCSR register.
2553 void _mm_setcsr(unsigned int __i);
2554 
2555 #if defined(__cplusplus)
2556 } // extern "C"
2557 #endif
2558 
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* The whole expansion is parenthesized so that a postfix operator applied to
 * the macro (for example a subscript) acts on the __m128 result rather than
 * binding to the builtin call before the cast. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2598 
2599 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2600 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2601 ///
2602 /// \headerfile <x86intrin.h>
2603 ///
2604 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2605 ///
2606 /// \param __a
2607 /// A 128-bit vector of [4 x float]. \n
2608 /// Bits [95:64] are written to bits [31:0] of the destination. \n
2609 /// Bits [127:96] are written to bits [95:64] of the destination.
2610 /// \param __b
2611 /// A 128-bit vector of [4 x float].
2612 /// Bits [95:64] are written to bits [63:32] of the destination. \n
2613 /// Bits [127:96] are written to bits [127:96] of the destination.
2614 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2615 static __inline__ __m128 __DEFAULT_FN_ATTRS
2616 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2617 {
2618  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2619 }
2620 
2621 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2622 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2623 ///
2624 /// \headerfile <x86intrin.h>
2625 ///
2626 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2627 ///
2628 /// \param __a
2629 /// A 128-bit vector of [4 x float]. \n
2630 /// Bits [31:0] are written to bits [31:0] of the destination. \n
2631 /// Bits [63:32] are written to bits [95:64] of the destination.
2632 /// \param __b
2633 /// A 128-bit vector of [4 x float]. \n
2634 /// Bits [31:0] are written to bits [63:32] of the destination. \n
2635 /// Bits [63:32] are written to bits [127:96] of the destination.
2636 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2637 static __inline__ __m128 __DEFAULT_FN_ATTRS
2638 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2639 {
2640  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2641 }
2642 
2643 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2644 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2645 /// 96 bits are set to the upper 96 bits of the first parameter.
2646 ///
2647 /// \headerfile <x86intrin.h>
2648 ///
2649 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2650 /// instruction.
2651 ///
2652 /// \param __a
2653 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2654 /// written to the upper 96 bits of the result.
2655 /// \param __b
2656 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2657 /// written to the lower 32 bits of the result.
2658 /// \returns A 128-bit floating-point vector of [4 x float].
2659 static __inline__ __m128 __DEFAULT_FN_ATTRS
2660 _mm_move_ss(__m128 __a, __m128 __b)
2661 {
2662  __a[0] = __b[0];
2663  return __a;
2664 }
2665 
2666 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2667 /// 64 bits are set to the upper 64 bits of the second parameter. The upper
2668 /// 64 bits are set to the upper 64 bits of the first parameter.
2669 ///
2670 /// \headerfile <x86intrin.h>
2671 ///
2672 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2673 ///
2674 /// \param __a
2675 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2676 /// written to the upper 64 bits of the result.
2677 /// \param __b
2678 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2679 /// written to the lower 64 bits of the result.
2680 /// \returns A 128-bit floating-point vector of [4 x float].
2681 static __inline__ __m128 __DEFAULT_FN_ATTRS
2682 _mm_movehl_ps(__m128 __a, __m128 __b)
2683 {
2684  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2685 }
2686 
2687 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2688 /// 64 bits are set to the lower 64 bits of the first parameter. The upper
2689 /// 64 bits are set to the lower 64 bits of the second parameter.
2690 ///
2691 /// \headerfile <x86intrin.h>
2692 ///
2693 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2694 ///
2695 /// \param __a
2696 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2697 /// written to the lower 64 bits of the result.
2698 /// \param __b
2699 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2700 /// written to the upper 64 bits of the result.
2701 /// \returns A 128-bit floating-point vector of [4 x float].
2702 static __inline__ __m128 __DEFAULT_FN_ATTRS
2703 _mm_movelh_ps(__m128 __a, __m128 __b)
2704 {
2705  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2706 }
2707 
2708 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2709 /// float].
2710 ///
2711 /// \headerfile <x86intrin.h>
2712 ///
2713 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2714 ///
2715 /// \param __a
2716 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2717 /// from the corresponding elements in this operand.
2718 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2719 /// values from the operand.
2720 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2721 _mm_cvtpi16_ps(__m64 __a)
2722 {
2723  __m64 __b, __c;
2724  __m128 __r;
2725 
2726  __b = _mm_setzero_si64();
2727  __b = _mm_cmpgt_pi16(__b, __a);
2728  __c = _mm_unpackhi_pi16(__a, __b);
2729  __r = _mm_setzero_ps();
2730  __r = _mm_cvtpi32_ps(__r, __c);
2731  __r = _mm_movelh_ps(__r, __r);
2732  __c = _mm_unpacklo_pi16(__a, __b);
2733  __r = _mm_cvtpi32_ps(__r, __c);
2734 
2735  return __r;
2736 }
2737 
2738 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2739 /// 128-bit vector of [4 x float].
2740 ///
2741 /// \headerfile <x86intrin.h>
2742 ///
2743 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2744 ///
2745 /// \param __a
2746 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2747 /// destination are copied from the corresponding elements in this operand.
2748 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2749 /// values from the operand.
2750 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2751 _mm_cvtpu16_ps(__m64 __a)
2752 {
2753  __m64 __b, __c;
2754  __m128 __r;
2755 
2756  __b = _mm_setzero_si64();
2757  __c = _mm_unpackhi_pi16(__a, __b);
2758  __r = _mm_setzero_ps();
2759  __r = _mm_cvtpi32_ps(__r, __c);
2760  __r = _mm_movelh_ps(__r, __r);
2761  __c = _mm_unpacklo_pi16(__a, __b);
2762  __r = _mm_cvtpi32_ps(__r, __c);
2763 
2764  return __r;
2765 }
2766 
2767 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2768 /// into a 128-bit vector of [4 x float].
2769 ///
2770 /// \headerfile <x86intrin.h>
2771 ///
2772 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2773 ///
2774 /// \param __a
2775 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2776 /// from the corresponding lower 4 elements in this operand.
2777 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2778 /// values from the operand.
2779 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2780 _mm_cvtpi8_ps(__m64 __a)
2781 {
2782  __m64 __b;
2783 
2784  __b = _mm_setzero_si64();
2785  __b = _mm_cmpgt_pi8(__b, __a);
2786  __b = _mm_unpacklo_pi8(__a, __b);
2787 
2788  return _mm_cvtpi16_ps(__b);
2789 }
2790 
2791 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2792 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2793 ///
2794 /// \headerfile <x86intrin.h>
2795 ///
2796 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2797 ///
2798 /// \param __a
2799 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2800 /// destination are copied from the corresponding lower 4 elements in this
2801 /// operand.
2802 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2803 /// values from the source operand.
2804 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2805 _mm_cvtpu8_ps(__m64 __a)
2806 {
2807  __m64 __b;
2808 
2809  __b = _mm_setzero_si64();
2810  __b = _mm_unpacklo_pi8(__a, __b);
2811 
2812  return _mm_cvtpi16_ps(__b);
2813 }
2814 
2815 /// Converts the two 32-bit signed integer values from each 64-bit vector
2816 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2817 ///
2818 /// \headerfile <x86intrin.h>
2819 ///
2820 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2821 ///
2822 /// \param __a
2823 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2824 /// copied from the elements in this operand.
2825 /// \param __b
2826 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2827 /// copied from the elements in this operand.
2828 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2829 /// copied and converted values from the first operand. The upper 64 bits
2830 /// contain the copied and converted values from the second operand.
2831 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2832 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2833 {
2834  __m128 __c;
2835 
2836  __c = _mm_setzero_ps();
2837  __c = _mm_cvtpi32_ps(__c, __b);
2838  __c = _mm_movelh_ps(__c, __c);
2839 
2840  return _mm_cvtpi32_ps(__c, __a);
2841 }
2842 
2843 /// Converts each single-precision floating-point element of a 128-bit
2844 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2845 /// packs the results into a 64-bit integer vector of [4 x i16].
2846 ///
2847 /// If the floating-point element is NaN or infinity, or if the
2848 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2849 /// it is converted to 0x8000. Otherwise if the floating-point element is
2850 /// greater than 0x7FFF, it is converted to 0x7FFF.
2851 ///
2852 /// \headerfile <x86intrin.h>
2853 ///
2854 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2855 ///
2856 /// \param __a
2857 /// A 128-bit floating-point vector of [4 x float].
2858 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2859 /// values.
2860 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2861 _mm_cvtps_pi16(__m128 __a)
2862 {
2863  __m64 __b, __c;
2864 
2865  __b = _mm_cvtps_pi32(__a);
2866  __a = _mm_movehl_ps(__a, __a);
2867  __c = _mm_cvtps_pi32(__a);
2868 
2869  return _mm_packs_pi32(__b, __c);
2870 }
2871 
2872 /// Converts each single-precision floating-point element of a 128-bit
2873 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
2874 /// packs the results into the lower 32 bits of a 64-bit integer vector of
2875 /// [8 x i8]. The upper 32 bits of the vector are set to 0.
2876 ///
2877 /// If the floating-point element is NaN or infinity, or if the
2878 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2879 /// is converted to 0x80. Otherwise if the floating-point element is greater
2880 /// than 0x7F, it is converted to 0x7F.
2881 ///
2882 /// \headerfile <x86intrin.h>
2883 ///
2884 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2885 ///
2886 /// \param __a
2887 /// 128-bit floating-point vector of [4 x float].
2888 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2889 /// converted values and the uppper 32 bits are set to zero.
2890 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2891 _mm_cvtps_pi8(__m128 __a)
2892 {
2893  __m64 __b, __c;
2894 
2895  __b = _mm_cvtps_pi16(__a);
2896  __c = _mm_setzero_si64();
2897 
2898  return _mm_packs_pi16(__b, __c);
2899 }
2900 
2901 /// Extracts the sign bits from each single-precision floating-point
2902 /// element of a 128-bit floating-point vector of [4 x float] and returns the
2903 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2904 /// to zero.
2905 ///
2906 /// \headerfile <x86intrin.h>
2907 ///
2908 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2909 ///
2910 /// \param __a
2911 /// A 128-bit floating-point vector of [4 x float].
2912 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2913 /// single-precision floating-point element of the parameter. Bits [31:4] are
2914 /// set to zero.
2915 static __inline__ int __DEFAULT_FN_ATTRS
2916 _mm_movemask_ps(__m128 __a)
2917 {
2918  return __builtin_ia32_movmskps((__v4sf)__a);
2919 }
2920 
2921 
/* Declaration attribute requesting 16-byte alignment. The reserved
 * __aligned__ spelling is used, matching the vector typedefs at the top of
 * this header, so that a user macro named `aligned` cannot alter the
 * expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
2923 
/* Packs four element selectors into one immediate: w lands in bits [1:0],
 * x in bits [3:2], y in bits [5:4] and z in bits [7:6]. This matches the mask
 * layout documented for _mm_shuffle_ps. The arguments are not masked, so each
 * is expected to be in the range 0-3. */
2924 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2925 
/* MXCSR exception state flags, bits [5:0]. _MM_EXCEPT_MASK covers all six. */
2926 #define _MM_EXCEPT_INVALID (0x0001)
2927 #define _MM_EXCEPT_DENORM (0x0002)
2928 #define _MM_EXCEPT_DIV_ZERO (0x0004)
2929 #define _MM_EXCEPT_OVERFLOW (0x0008)
2930 #define _MM_EXCEPT_UNDERFLOW (0x0010)
2931 #define _MM_EXCEPT_INEXACT (0x0020)
2932 #define _MM_EXCEPT_MASK (0x003f)
2933 
/* MXCSR exception mask bits, bits [12:7]. _MM_MASK_MASK covers all six. */
2934 #define _MM_MASK_INVALID (0x0080)
2935 #define _MM_MASK_DENORM (0x0100)
2936 #define _MM_MASK_DIV_ZERO (0x0200)
2937 #define _MM_MASK_OVERFLOW (0x0400)
2938 #define _MM_MASK_UNDERFLOW (0x0800)
2939 #define _MM_MASK_INEXACT (0x1000)
2940 #define _MM_MASK_MASK (0x1f80)
2941 
/* MXCSR rounding-mode field, bits [14:13]. The four modes are mutually
 * exclusive values of the field, not independent flags: OR-ing
 * _MM_ROUND_DOWN and _MM_ROUND_UP yields _MM_ROUND_TOWARD_ZERO. */
2942 #define _MM_ROUND_NEAREST (0x0000)
2943 #define _MM_ROUND_DOWN (0x2000)
2944 #define _MM_ROUND_UP (0x4000)
2945 #define _MM_ROUND_TOWARD_ZERO (0x6000)
2946 #define _MM_ROUND_MASK (0x6000)
2947 
/* MXCSR flush-to-zero control, bit 15. */
2948 #define _MM_FLUSH_ZERO_MASK (0x8000)
2949 #define _MM_FLUSH_ZERO_ON (0x8000)
2950 #define _MM_FLUSH_ZERO_OFF (0x0000)
2951 
/* Getters: read MXCSR with _mm_getcsr() and keep only the named field. */
2952 #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
2953 #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
2954 #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
2955 #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
2956 
/* Setters: read MXCSR, clear the named field, OR in (x) and write the result
 * back with _mm_setcsr(). (x) itself is not masked, so any bits it carries
 * outside the field are written as well. */
2957 #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
2958 #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
2959 #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
2960 #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2961 
/* Transposes, in place, the 4x4 matrix of floats whose rows are held in the
 * four __m128 arguments. Each argument must be a modifiable lvalue and is
 * evaluated more than once, so it must not have side effects.
 *
 * The temporaries use reserved (double-underscore) names so they cannot
 * shadow the caller's row variables; with user-namespace names such as tmp0,
 * a call like _MM_TRANSPOSE4_PS(tmp0, tmp1, tmp2, tmp3) would operate on the
 * macro's own uninitialized locals instead of the caller's data. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2974 
/* Aliases for compatibility: each _m_* name expands to the _mm_* intrinsic
 * listed beside it. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* NOTE(review): `_m_` names no intrinsic; it is kept, defined once instead of
 * twice, only so that existing code which happens to reference it still
 * preprocesses the same way. */
#define _m_ _mm_
2991 
2992 #undef __DEFAULT_FN_ATTRS
2993 #undef __DEFAULT_FN_ATTRS_MMX
2994 
2995 /* Ugly hack for backwards-compatibility (compatible with gcc) */
2996 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
2997 #include <emmintrin.h>
2998 #endif
2999 
3000 #endif /* __XMMINTRIN_H */
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location. ...
Definition: xmmintrin.h:1954
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:972
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition: xmmintrin.h:2440
static __inline unsigned char unsigned int unsigned int __y
Definition: adxintrin.h:22
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1563
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:308
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition: xmmintrin.h:325
struct __storeu_i16 *__P __v
Definition: immintrin.h:304
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
Definition: xmmintrin.h:1624
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality and returns the ...
Definition: xmmintrin.h:485
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition: xmmintrin.h:2616
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1490
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1161
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition: xmmintrin.h:92
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:743
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
Definition: xmmintrin.h:285
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values...
Definition: xmmintrin.h:344
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2242
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1833
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:2015
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
Definition: xmmintrin.h:1700
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition. ...
Definition: xmmintrin.h:70
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1282
int __v4si __attribute__((__vector_size__(16)))
Definition: xmmintrin.h:15
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:656
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(float *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location...
Definition: xmmintrin.h:2131
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2682
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts 16-bit signed integers from both 64-bit integer vector parameters of [4 x i16] into 8-bit si...
Definition: mmintrin.h:127
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:237
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location. ...
Definition: xmmintrin.h:1920
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:2035
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:588
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition: xmmintrin.h:2638
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
Definition: xmmintrin.h:1651
static __inline__ int __DEFAULT_FN_ATTRS_MMX _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
Definition: xmmintrin.h:2298
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:569
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
Definition: xmmintrin.h:113
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1065
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition: xmmintrin.h:2418
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition: xmmintrin.h:1776
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
Definition: xmmintrin.h:367
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
Definition: xmmintrin.h:2780
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1247
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1412
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:526
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values...
Definition: xmmintrin.h:386
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1814
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2660
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:833
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
Definition: xmmintrin.h:2054
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:1975
static __inline__ void int __a
Definition: emmintrin.h:4185
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1586
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1678
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:947
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:135
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:231
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for ineq...
Definition: xmmintrin.h:719
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2223
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1225
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float]. ...
Definition: xmmintrin.h:1603
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:992
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:24
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition: xmmintrin.h:2916
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1185
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition: xmmintrin.h:2317
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:808
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:1903
#define __DEFAULT_FN_ATTRS_MMX
Definition: xmmintrin.h:32
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float]...
Definition: xmmintrin.h:266
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:855
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition: xmmintrin.h:1996
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition: xmmintrin.h:2399
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:763
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:927
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1358
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:463
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2703
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:902
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1210
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1284
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:632
static __inline__ vector float vector float __b
Definition: altivec.h:520
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition: xmmintrin.h:214
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
Definition: xmmintrin.h:302
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1796
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...
Definition: xmmintrin.h:177
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality and returns the ...
Definition: xmmintrin.h:1016
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition: xmmintrin.h:1762
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1468
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:611
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:2891
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1513
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition: xmmintrin.h:1860
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition: xmmintrin.h:249
static __inline unsigned char unsigned int __x
Definition: adxintrin.h:22
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:285
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
Definition: xmmintrin.h:503
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition: xmmintrin.h:2832
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float]...
Definition: xmmintrin.h:2751
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:545
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1113
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1393
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition: xmmintrin.h:426
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1451
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition: xmmintrin.h:1888
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition: xmmintrin.h:155
#define __DEFAULT_FN_ATTRS
Definition: xmmintrin.h:31
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:196
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition: xmmintrin.h:2380
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1137
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:677
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:444
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1320
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2280
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1041
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
Definition: xmmintrin.h:2805
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality and returns th...
Definition: xmmintrin.h:700
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:404
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:880
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2261
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2)
Converts 32-bit signed integers from both 64-bit integer vector parameters of [2 x i32] into 16-bit s...
Definition: mmintrin.h:157
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1235
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1740
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1260
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2721
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition: xmmintrin.h:1723
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:2861
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location. ...
Definition: xmmintrin.h:1937
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1374
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:788
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1089
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4185
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:50
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_stream_pi(__m64 *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition: xmmintrin.h:2112
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1302