clang  6.0.0svn
xmmintrin.h
Go to the documentation of this file.
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to deal
5  * in the Software without restriction, including without limitation the rights
6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7  * copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19  * THE SOFTWARE.
20  *
21  *===-----------------------------------------------------------------------===
22  */
23 
24 #ifndef __XMMINTRIN_H
25 #define __XMMINTRIN_H
26 
27 #include <mmintrin.h>
28 
/* Internal 16-byte vector types used to implement the intrinsics below. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* 16-byte float vector type that the intrinsics take and return. */
typedef float __m128 __attribute__((__vector_size__(16)));

/* Unsigned integer vector type; the bitwise intrinsics operate on it. */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
35 
36 /* This header should only be included in a hosted environment as it depends on
37  * a standard library to provide allocation routines. */
38 #if __STDC_HOSTED__
39 #include <mm_malloc.h>
40 #endif
41 
/* Attribute set attached to every intrinsic defined in this file: always
 * inline, no debug info, and the "sse" target feature. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse")))
44 
45 /// \brief Adds the 32-bit float values in the low-order bits of the operands.
46 ///
47 /// \headerfile <x86intrin.h>
48 ///
49 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
50 ///
51 /// \param __a
52 /// A 128-bit vector of [4 x float] containing one of the source operands.
53 /// The lower 32 bits of this operand are used in the calculation.
54 /// \param __b
55 /// A 128-bit vector of [4 x float] containing one of the source operands.
56 /// The lower 32 bits of this operand are used in the calculation.
57 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
58 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
59 /// the upper 96 bits of the first source operand.
60 static __inline__ __m128 __DEFAULT_FN_ATTRS
61 _mm_add_ss(__m128 __a, __m128 __b)
62 {
63  __a[0] += __b[0];
64  return __a;
65 }
66 
67 /// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
68 /// the addition.
69 ///
70 /// \headerfile <x86intrin.h>
71 ///
72 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
73 ///
74 /// \param __a
75 /// A 128-bit vector of [4 x float] containing one of the source operands.
76 /// \param __b
77 /// A 128-bit vector of [4 x float] containing one of the source operands.
78 /// \returns A 128-bit vector of [4 x float] containing the sums of both
79 /// operands.
80 static __inline__ __m128 __DEFAULT_FN_ATTRS
81 _mm_add_ps(__m128 __a, __m128 __b)
82 {
83  return (__m128)((__v4sf)__a + (__v4sf)__b);
84 }
85 
86 /// \brief Subtracts the 32-bit float value in the low-order bits of the second
87 /// operand from the corresponding value in the first operand.
88 ///
89 /// \headerfile <x86intrin.h>
90 ///
91 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
92 ///
93 /// \param __a
94 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
95 /// of this operand are used in the calculation.
96 /// \param __b
97 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
98 /// bits of this operand are used in the calculation.
99 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
100 /// difference of the lower 32 bits of both operands. The upper 96 bits are
101 /// copied from the upper 96 bits of the first source operand.
102 static __inline__ __m128 __DEFAULT_FN_ATTRS
103 _mm_sub_ss(__m128 __a, __m128 __b)
104 {
105  __a[0] -= __b[0];
106  return __a;
107 }
108 
109 /// \brief Subtracts each of the values of the second operand from the first
110 /// operand, both of which are 128-bit vectors of [4 x float] and returns
111 /// the results of the subtraction.
112 ///
113 /// \headerfile <x86intrin.h>
114 ///
115 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
116 ///
117 /// \param __a
118 /// A 128-bit vector of [4 x float] containing the minuend.
119 /// \param __b
120 /// A 128-bit vector of [4 x float] containing the subtrahend.
121 /// \returns A 128-bit vector of [4 x float] containing the differences between
122 /// both operands.
123 static __inline__ __m128 __DEFAULT_FN_ATTRS
124 _mm_sub_ps(__m128 __a, __m128 __b)
125 {
126  return (__m128)((__v4sf)__a - (__v4sf)__b);
127 }
128 
129 /// \brief Multiplies two 32-bit float values in the low-order bits of the
130 /// operands.
131 ///
132 /// \headerfile <x86intrin.h>
133 ///
134 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
135 ///
136 /// \param __a
137 /// A 128-bit vector of [4 x float] containing one of the source operands.
138 /// The lower 32 bits of this operand are used in the calculation.
139 /// \param __b
140 /// A 128-bit vector of [4 x float] containing one of the source operands.
141 /// The lower 32 bits of this operand are used in the calculation.
142 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
143 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
144 /// bits of the first source operand.
145 static __inline__ __m128 __DEFAULT_FN_ATTRS
146 _mm_mul_ss(__m128 __a, __m128 __b)
147 {
148  __a[0] *= __b[0];
149  return __a;
150 }
151 
152 /// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
153 /// results of the multiplication.
154 ///
155 /// \headerfile <x86intrin.h>
156 ///
157 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
158 ///
159 /// \param __a
160 /// A 128-bit vector of [4 x float] containing one of the source operands.
161 /// \param __b
162 /// A 128-bit vector of [4 x float] containing one of the source operands.
163 /// \returns A 128-bit vector of [4 x float] containing the products of both
164 /// operands.
165 static __inline__ __m128 __DEFAULT_FN_ATTRS
166 _mm_mul_ps(__m128 __a, __m128 __b)
167 {
168  return (__m128)((__v4sf)__a * (__v4sf)__b);
169 }
170 
171 /// \brief Divides the value in the low-order 32 bits of the first operand by
172 /// the corresponding value in the second operand.
173 ///
174 /// \headerfile <x86intrin.h>
175 ///
176 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
177 ///
178 /// \param __a
179 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
180 /// bits of this operand are used in the calculation.
181 /// \param __b
182 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
183 /// of this operand are used in the calculation.
184 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
185 /// lower 32 bits of both operands. The upper 96 bits are copied from the
186 /// upper 96 bits of the first source operand.
187 static __inline__ __m128 __DEFAULT_FN_ATTRS
188 _mm_div_ss(__m128 __a, __m128 __b)
189 {
190  __a[0] /= __b[0];
191  return __a;
192 }
193 
194 /// \brief Divides two 128-bit vectors of [4 x float].
195 ///
196 /// \headerfile <x86intrin.h>
197 ///
198 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
199 ///
200 /// \param __a
201 /// A 128-bit vector of [4 x float] containing the dividend.
202 /// \param __b
203 /// A 128-bit vector of [4 x float] containing the divisor.
204 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
205 /// operands.
206 static __inline__ __m128 __DEFAULT_FN_ATTRS
207 _mm_div_ps(__m128 __a, __m128 __b)
208 {
209  return (__m128)((__v4sf)__a / (__v4sf)__b);
210 }
211 
212 /// \brief Calculates the square root of the value stored in the low-order bits
213 /// of a 128-bit vector of [4 x float].
214 ///
215 /// \headerfile <x86intrin.h>
216 ///
217 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
218 ///
219 /// \param __a
220 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
221 /// used in the calculation.
222 /// \returns A 128-bit vector of [4 x float] containing the square root of the
223 /// value in the low-order bits of the operand.
224 static __inline__ __m128 __DEFAULT_FN_ATTRS
225 _mm_sqrt_ss(__m128 __a)
226 {
227  __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
228  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
229 }
230 
231 /// \brief Calculates the square roots of the values stored in a 128-bit vector
232 /// of [4 x float].
233 ///
234 /// \headerfile <x86intrin.h>
235 ///
236 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
237 ///
238 /// \param __a
239 /// A 128-bit vector of [4 x float].
240 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
241 /// values in the operand.
242 static __inline__ __m128 __DEFAULT_FN_ATTRS
243 _mm_sqrt_ps(__m128 __a)
244 {
245  return __builtin_ia32_sqrtps((__v4sf)__a);
246 }
247 
248 /// \brief Calculates the approximate reciprocal of the value stored in the
249 /// low-order bits of a 128-bit vector of [4 x float].
250 ///
251 /// \headerfile <x86intrin.h>
252 ///
253 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
254 ///
255 /// \param __a
256 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
257 /// used in the calculation.
258 /// \returns A 128-bit vector of [4 x float] containing the approximate
259 /// reciprocal of the value in the low-order bits of the operand.
260 static __inline__ __m128 __DEFAULT_FN_ATTRS
261 _mm_rcp_ss(__m128 __a)
262 {
263  __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
264  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
265 }
266 
267 /// \brief Calculates the approximate reciprocals of the values stored in a
268 /// 128-bit vector of [4 x float].
269 ///
270 /// \headerfile <x86intrin.h>
271 ///
272 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
273 ///
274 /// \param __a
275 /// A 128-bit vector of [4 x float].
276 /// \returns A 128-bit vector of [4 x float] containing the approximate
277 /// reciprocals of the values in the operand.
278 static __inline__ __m128 __DEFAULT_FN_ATTRS
279 _mm_rcp_ps(__m128 __a)
280 {
281  return __builtin_ia32_rcpps((__v4sf)__a);
282 }
283 
284 /// \brief Calculates the approximate reciprocal of the square root of the value
285 /// stored in the low-order bits of a 128-bit vector of [4 x float].
286 ///
287 /// \headerfile <x86intrin.h>
288 ///
289 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
290 ///
291 /// \param __a
292 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
293 /// used in the calculation.
294 /// \returns A 128-bit vector of [4 x float] containing the approximate
295 /// reciprocal of the square root of the value in the low-order bits of the
296 /// operand.
297 static __inline__ __m128 __DEFAULT_FN_ATTRS
298 _mm_rsqrt_ss(__m128 __a)
299 {
300  __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
301  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
302 }
303 
304 /// \brief Calculates the approximate reciprocals of the square roots of the
305 /// values stored in a 128-bit vector of [4 x float].
306 ///
307 /// \headerfile <x86intrin.h>
308 ///
309 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
310 ///
311 /// \param __a
312 /// A 128-bit vector of [4 x float].
313 /// \returns A 128-bit vector of [4 x float] containing the approximate
314 /// reciprocals of the square roots of the values in the operand.
315 static __inline__ __m128 __DEFAULT_FN_ATTRS
316 _mm_rsqrt_ps(__m128 __a)
317 {
318  return __builtin_ia32_rsqrtps((__v4sf)__a);
319 }
320 
321 /// \brief Compares two 32-bit float values in the low-order bits of both
322 /// operands and returns the lesser value in the low-order bits of the
323 /// vector of [4 x float].
324 ///
325 /// \headerfile <x86intrin.h>
326 ///
327 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
328 ///
329 /// \param __a
330 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
331 /// 32 bits of this operand are used in the comparison.
332 /// \param __b
333 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
334 /// 32 bits of this operand are used in the comparison.
335 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
336 /// minimum value between both operands. The upper 96 bits are copied from
337 /// the upper 96 bits of the first source operand.
338 static __inline__ __m128 __DEFAULT_FN_ATTRS
339 _mm_min_ss(__m128 __a, __m128 __b)
340 {
341  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
342 }
343 
344 /// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser
345 /// of each pair of values.
346 ///
347 /// \headerfile <x86intrin.h>
348 ///
349 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
350 ///
351 /// \param __a
352 /// A 128-bit vector of [4 x float] containing one of the operands.
353 /// \param __b
354 /// A 128-bit vector of [4 x float] containing one of the operands.
355 /// \returns A 128-bit vector of [4 x float] containing the minimum values
356 /// between both operands.
357 static __inline__ __m128 __DEFAULT_FN_ATTRS
358 _mm_min_ps(__m128 __a, __m128 __b)
359 {
360  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
361 }
362 
363 /// \brief Compares two 32-bit float values in the low-order bits of both
364 /// operands and returns the greater value in the low-order bits of a 128-bit
365 /// vector of [4 x float].
366 ///
367 /// \headerfile <x86intrin.h>
368 ///
369 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
370 ///
371 /// \param __a
372 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
373 /// 32 bits of this operand are used in the comparison.
374 /// \param __b
375 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
376 /// 32 bits of this operand are used in the comparison.
377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378 /// maximum value between both operands. The upper 96 bits are copied from
379 /// the upper 96 bits of the first source operand.
380 static __inline__ __m128 __DEFAULT_FN_ATTRS
381 _mm_max_ss(__m128 __a, __m128 __b)
382 {
383  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
384 }
385 
386 /// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
387 /// of each pair of values.
388 ///
389 /// \headerfile <x86intrin.h>
390 ///
391 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
392 ///
393 /// \param __a
394 /// A 128-bit vector of [4 x float] containing one of the operands.
395 /// \param __b
396 /// A 128-bit vector of [4 x float] containing one of the operands.
397 /// \returns A 128-bit vector of [4 x float] containing the maximum values
398 /// between both operands.
399 static __inline__ __m128 __DEFAULT_FN_ATTRS
400 _mm_max_ps(__m128 __a, __m128 __b)
401 {
402  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
403 }
404 
405 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
406 ///
407 /// \headerfile <x86intrin.h>
408 ///
409 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
410 ///
411 /// \param __a
412 /// A 128-bit vector containing one of the source operands.
413 /// \param __b
414 /// A 128-bit vector containing one of the source operands.
415 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
416 /// values between both operands.
417 static __inline__ __m128 __DEFAULT_FN_ATTRS
418 _mm_and_ps(__m128 __a, __m128 __b)
419 {
420  return (__m128)((__v4su)__a & (__v4su)__b);
421 }
422 
423 /// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
424 /// the one's complement of the values contained in the first source
425 /// operand.
426 ///
427 /// \headerfile <x86intrin.h>
428 ///
429 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
430 ///
431 /// \param __a
432 /// A 128-bit vector of [4 x float] containing the first source operand. The
433 /// one's complement of this value is used in the bitwise AND.
434 /// \param __b
435 /// A 128-bit vector of [4 x float] containing the second source operand.
436 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
437 /// one's complement of the first operand and the values in the second
438 /// operand.
439 static __inline__ __m128 __DEFAULT_FN_ATTRS
440 _mm_andnot_ps(__m128 __a, __m128 __b)
441 {
442  return (__m128)(~(__v4su)__a & (__v4su)__b);
443 }
444 
445 /// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
446 ///
447 /// \headerfile <x86intrin.h>
448 ///
449 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
450 ///
451 /// \param __a
452 /// A 128-bit vector of [4 x float] containing one of the source operands.
453 /// \param __b
454 /// A 128-bit vector of [4 x float] containing one of the source operands.
455 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
456 /// values between both operands.
457 static __inline__ __m128 __DEFAULT_FN_ATTRS
458 _mm_or_ps(__m128 __a, __m128 __b)
459 {
460  return (__m128)((__v4su)__a | (__v4su)__b);
461 }
462 
463 /// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
464 /// [4 x float].
465 ///
466 /// \headerfile <x86intrin.h>
467 ///
468 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
469 ///
470 /// \param __a
471 /// A 128-bit vector of [4 x float] containing one of the source operands.
472 /// \param __b
473 /// A 128-bit vector of [4 x float] containing one of the source operands.
474 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
475 /// of the values between both operands.
476 static __inline__ __m128 __DEFAULT_FN_ATTRS
477 _mm_xor_ps(__m128 __a, __m128 __b)
478 {
479  return (__m128)((__v4su)__a ^ (__v4su)__b);
480 }
481 
482 /// \brief Compares two 32-bit float values in the low-order bits of both
483 /// operands for equality and returns the result of the comparison in the
484 /// low-order bits of a vector [4 x float].
485 ///
486 /// \headerfile <x86intrin.h>
487 ///
488 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
489 ///
490 /// \param __a
491 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
492 /// 32 bits of this operand are used in the comparison.
493 /// \param __b
494 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
495 /// 32 bits of this operand are used in the comparison.
496 /// \returns A 128-bit vector of [4 x float] containing the comparison results
497 /// in the low-order bits.
498 static __inline__ __m128 __DEFAULT_FN_ATTRS
499 _mm_cmpeq_ss(__m128 __a, __m128 __b)
500 {
501  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
502 }
503 
504 /// \brief Compares each of the corresponding 32-bit float values of the
505 /// 128-bit vectors of [4 x float] for equality.
506 ///
507 /// \headerfile <x86intrin.h>
508 ///
509 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
510 ///
511 /// \param __a
512 /// A 128-bit vector of [4 x float].
513 /// \param __b
514 /// A 128-bit vector of [4 x float].
515 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
516 static __inline__ __m128 __DEFAULT_FN_ATTRS
517 _mm_cmpeq_ps(__m128 __a, __m128 __b)
518 {
519  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
520 }
521 
522 /// \brief Compares two 32-bit float values in the low-order bits of both
523 /// operands to determine if the value in the first operand is less than the
524 /// corresponding value in the second operand and returns the result of the
525 /// comparison in the low-order bits of a vector of [4 x float].
526 ///
527 /// \headerfile <x86intrin.h>
528 ///
529 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
530 ///
531 /// \param __a
532 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
533 /// 32 bits of this operand are used in the comparison.
534 /// \param __b
535 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
536 /// 32 bits of this operand are used in the comparison.
537 /// \returns A 128-bit vector of [4 x float] containing the comparison results
538 /// in the low-order bits.
539 static __inline__ __m128 __DEFAULT_FN_ATTRS
540 _mm_cmplt_ss(__m128 __a, __m128 __b)
541 {
542  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
543 }
544 
545 /// \brief Compares each of the corresponding 32-bit float values of the
546 /// 128-bit vectors of [4 x float] to determine if the values in the first
547 /// operand are less than those in the second operand.
548 ///
549 /// \headerfile <x86intrin.h>
550 ///
551 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
552 ///
553 /// \param __a
554 /// A 128-bit vector of [4 x float].
555 /// \param __b
556 /// A 128-bit vector of [4 x float].
557 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
558 static __inline__ __m128 __DEFAULT_FN_ATTRS
559 _mm_cmplt_ps(__m128 __a, __m128 __b)
560 {
561  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
562 }
563 
564 /// \brief Compares two 32-bit float values in the low-order bits of both
565 /// operands to determine if the value in the first operand is less than or
566 /// equal to the corresponding value in the second operand and returns the
567 /// result of the comparison in the low-order bits of a vector of
568 /// [4 x float].
569 ///
570 /// \headerfile <x86intrin.h>
571 ///
572 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
573 ///
574 /// \param __a
575 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
576 /// 32 bits of this operand are used in the comparison.
577 /// \param __b
578 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
579 /// 32 bits of this operand are used in the comparison.
580 /// \returns A 128-bit vector of [4 x float] containing the comparison results
581 /// in the low-order bits.
582 static __inline__ __m128 __DEFAULT_FN_ATTRS
583 _mm_cmple_ss(__m128 __a, __m128 __b)
584 {
585  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
586 }
587 
588 /// \brief Compares each of the corresponding 32-bit float values of the
589 /// 128-bit vectors of [4 x float] to determine if the values in the first
590 /// operand are less than or equal to those in the second operand.
591 ///
592 /// \headerfile <x86intrin.h>
593 ///
594 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
595 ///
596 /// \param __a
597 /// A 128-bit vector of [4 x float].
598 /// \param __b
599 /// A 128-bit vector of [4 x float].
600 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
601 static __inline__ __m128 __DEFAULT_FN_ATTRS
602 _mm_cmple_ps(__m128 __a, __m128 __b)
603 {
604  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
605 }
606 
607 /// \brief Compares two 32-bit float values in the low-order bits of both
608 /// operands to determine if the value in the first operand is greater than
609 /// the corresponding value in the second operand and returns the result of
610 /// the comparison in the low-order bits of a vector of [4 x float].
611 ///
612 /// \headerfile <x86intrin.h>
613 ///
614 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
615 ///
616 /// \param __a
617 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
618 /// 32 bits of this operand are used in the comparison.
619 /// \param __b
620 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
621 /// 32 bits of this operand are used in the comparison.
622 /// \returns A 128-bit vector of [4 x float] containing the comparison results
623 /// in the low-order bits.
624 static __inline__ __m128 __DEFAULT_FN_ATTRS
625 _mm_cmpgt_ss(__m128 __a, __m128 __b)
626 {
627  return (__m128)__builtin_shufflevector((__v4sf)__a,
628  (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
629  4, 1, 2, 3);
630 }
631 
632 /// \brief Compares each of the corresponding 32-bit float values of the
633 /// 128-bit vectors of [4 x float] to determine if the values in the first
634 /// operand are greater than those in the second operand.
635 ///
636 /// \headerfile <x86intrin.h>
637 ///
638 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
639 ///
640 /// \param __a
641 /// A 128-bit vector of [4 x float].
642 /// \param __b
643 /// A 128-bit vector of [4 x float].
644 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
645 static __inline__ __m128 __DEFAULT_FN_ATTRS
646 _mm_cmpgt_ps(__m128 __a, __m128 __b)
647 {
648  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
649 }
650 
651 /// \brief Compares two 32-bit float values in the low-order bits of both
652 /// operands to determine if the value in the first operand is greater than
653 /// or equal to the corresponding value in the second operand and returns
654 /// the result of the comparison in the low-order bits of a vector of
655 /// [4 x float].
656 ///
657 /// \headerfile <x86intrin.h>
658 ///
659 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
660 ///
661 /// \param __a
662 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
663 /// 32 bits of this operand are used in the comparison.
664 /// \param __b
665 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
666 /// 32 bits of this operand are used in the comparison.
667 /// \returns A 128-bit vector of [4 x float] containing the comparison results
668 /// in the low-order bits.
669 static __inline__ __m128 __DEFAULT_FN_ATTRS
670 _mm_cmpge_ss(__m128 __a, __m128 __b)
671 {
672  return (__m128)__builtin_shufflevector((__v4sf)__a,
673  (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
674  4, 1, 2, 3);
675 }
676 
677 /// \brief Compares each of the corresponding 32-bit float values of the
678 /// 128-bit vectors of [4 x float] to determine if the values in the first
679 /// operand are greater than or equal to those in the second operand.
680 ///
681 /// \headerfile <x86intrin.h>
682 ///
683 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
684 ///
685 /// \param __a
686 /// A 128-bit vector of [4 x float].
687 /// \param __b
688 /// A 128-bit vector of [4 x float].
689 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
690 static __inline__ __m128 __DEFAULT_FN_ATTRS
691 _mm_cmpge_ps(__m128 __a, __m128 __b)
692 {
693  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
694 }
695 
696 /// \brief Compares two 32-bit float values in the low-order bits of both
697 /// operands for inequality and returns the result of the comparison in the
698 /// low-order bits of a vector of [4 x float].
699 ///
700 /// \headerfile <x86intrin.h>
701 ///
702 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
703 /// instructions.
704 ///
705 /// \param __a
706 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
707 /// 32 bits of this operand are used in the comparison.
708 /// \param __b
709 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
710 /// 32 bits of this operand are used in the comparison.
711 /// \returns A 128-bit vector of [4 x float] containing the comparison results
712 /// in the low-order bits.
713 static __inline__ __m128 __DEFAULT_FN_ATTRS
714 _mm_cmpneq_ss(__m128 __a, __m128 __b)
715 {
716  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
717 }
718 
719 /// \brief Compares each of the corresponding 32-bit float values of the
720 /// 128-bit vectors of [4 x float] for inequality.
721 ///
722 /// \headerfile <x86intrin.h>
723 ///
724 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
725 /// instructions.
726 ///
727 /// \param __a
728 /// A 128-bit vector of [4 x float].
729 /// \param __b
730 /// A 128-bit vector of [4 x float].
731 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
732 static __inline__ __m128 __DEFAULT_FN_ATTRS
733 _mm_cmpneq_ps(__m128 __a, __m128 __b)
734 {
735  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
736 }
737 
738 /// \brief Compares two 32-bit float values in the low-order bits of both
739 /// operands to determine if the value in the first operand is not less than
740 /// the corresponding value in the second operand and returns the result of
741 /// the comparison in the low-order bits of a vector of [4 x float].
742 ///
743 /// \headerfile <x86intrin.h>
744 ///
745 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
746 /// instructions.
747 ///
748 /// \param __a
749 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
750 /// 32 bits of this operand are used in the comparison.
751 /// \param __b
752 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
753 /// 32 bits of this operand are used in the comparison.
754 /// \returns A 128-bit vector of [4 x float] containing the comparison results
755 /// in the low-order bits.
756 static __inline__ __m128 __DEFAULT_FN_ATTRS
757 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
758 {
759  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
760 }
761 
762 /// \brief Compares each of the corresponding 32-bit float values of the
763 /// 128-bit vectors of [4 x float] to determine if the values in the first
764 /// operand are not less than those in the second operand.
765 ///
766 /// \headerfile <x86intrin.h>
767 ///
768 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
769 /// instructions.
770 ///
771 /// \param __a
772 /// A 128-bit vector of [4 x float].
773 /// \param __b
774 /// A 128-bit vector of [4 x float].
775 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
776 static __inline__ __m128 __DEFAULT_FN_ATTRS
777 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
778 {
779  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
780 }
781 
782 /// \brief Compares two 32-bit float values in the low-order bits of both
783 /// operands to determine if the value in the first operand is not less than
784 /// or equal to the corresponding value in the second operand and returns
785 /// the result of the comparison in the low-order bits of a vector of
786 /// [4 x float].
787 ///
788 /// \headerfile <x86intrin.h>
789 ///
790 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
791 /// instructions.
792 ///
793 /// \param __a
794 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
795 /// 32 bits of this operand are used in the comparison.
796 /// \param __b
797 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
798 /// 32 bits of this operand are used in the comparison.
799 /// \returns A 128-bit vector of [4 x float] containing the comparison results
800 /// in the low-order bits.
801 static __inline__ __m128 __DEFAULT_FN_ATTRS
802 _mm_cmpnle_ss(__m128 __a, __m128 __b)
803 {
804  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
805 }
806 
807 /// \brief Compares each of the corresponding 32-bit float values of the
808 /// 128-bit vectors of [4 x float] to determine if the values in the first
809 /// operand are not less than or equal to those in the second operand.
810 ///
811 /// \headerfile <x86intrin.h>
812 ///
813 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
814 /// instructions.
815 ///
816 /// \param __a
817 /// A 128-bit vector of [4 x float].
818 /// \param __b
819 /// A 128-bit vector of [4 x float].
820 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
821 static __inline__ __m128 __DEFAULT_FN_ATTRS
822 _mm_cmpnle_ps(__m128 __a, __m128 __b)
823 {
824  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
825 }
826 
827 /// \brief Compares two 32-bit float values in the low-order bits of both
828 /// operands to determine if the value in the first operand is not greater
829 /// than the corresponding value in the second operand and returns the
830 /// result of the comparison in the low-order bits of a vector of
831 /// [4 x float].
832 ///
833 /// \headerfile <x86intrin.h>
834 ///
835 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
836 /// instructions.
837 ///
838 /// \param __a
839 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
840 /// 32 bits of this operand are used in the comparison.
841 /// \param __b
842 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
843 /// 32 bits of this operand are used in the comparison.
844 /// \returns A 128-bit vector of [4 x float] containing the comparison results
845 /// in the low-order bits.
846 static __inline__ __m128 __DEFAULT_FN_ATTRS
847 _mm_cmpngt_ss(__m128 __a, __m128 __b)
848 {
849  return (__m128)__builtin_shufflevector((__v4sf)__a,
850  (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
851  4, 1, 2, 3);
852 }
853 
854 /// \brief Compares each of the corresponding 32-bit float values of the
855 /// 128-bit vectors of [4 x float] to determine if the values in the first
856 /// operand are not greater than those in the second operand.
857 ///
858 /// \headerfile <x86intrin.h>
859 ///
860 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
861 /// instructions.
862 ///
863 /// \param __a
864 /// A 128-bit vector of [4 x float].
865 /// \param __b
866 /// A 128-bit vector of [4 x float].
867 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
868 static __inline__ __m128 __DEFAULT_FN_ATTRS
869 _mm_cmpngt_ps(__m128 __a, __m128 __b)
870 {
871  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
872 }
873 
874 /// \brief Compares two 32-bit float values in the low-order bits of both
875 /// operands to determine if the value in the first operand is not greater
876 /// than or equal to the corresponding value in the second operand and
877 /// returns the result of the comparison in the low-order bits of a vector
878 /// of [4 x float].
879 ///
880 /// \headerfile <x86intrin.h>
881 ///
882 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
883 /// instructions.
884 ///
885 /// \param __a
886 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
887 /// 32 bits of this operand are used in the comparison.
888 /// \param __b
889 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
890 /// 32 bits of this operand are used in the comparison.
891 /// \returns A 128-bit vector of [4 x float] containing the comparison results
892 /// in the low-order bits.
893 static __inline__ __m128 __DEFAULT_FN_ATTRS
894 _mm_cmpnge_ss(__m128 __a, __m128 __b)
895 {
896  return (__m128)__builtin_shufflevector((__v4sf)__a,
897  (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
898  4, 1, 2, 3);
899 }
900 
901 /// \brief Compares each of the corresponding 32-bit float values of the
902 /// 128-bit vectors of [4 x float] to determine if the values in the first
903 /// operand are not greater than or equal to those in the second operand.
904 ///
905 /// \headerfile <x86intrin.h>
906 ///
907 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
908 /// instructions.
909 ///
910 /// \param __a
911 /// A 128-bit vector of [4 x float].
912 /// \param __b
913 /// A 128-bit vector of [4 x float].
914 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
915 static __inline__ __m128 __DEFAULT_FN_ATTRS
916 _mm_cmpnge_ps(__m128 __a, __m128 __b)
917 {
918  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
919 }
920 
921 /// \brief Compares two 32-bit float values in the low-order bits of both
922 /// operands to determine if the value in the first operand is ordered with
923 /// respect to the corresponding value in the second operand and returns the
924 /// result of the comparison in the low-order bits of a vector of
925 /// [4 x float].
926 ///
927 /// \headerfile <x86intrin.h>
928 ///
929 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
930 /// instructions.
931 ///
932 /// \param __a
933 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
934 /// 32 bits of this operand are used in the comparison.
935 /// \param __b
936 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
937 /// 32 bits of this operand are used in the comparison.
938 /// \returns A 128-bit vector of [4 x float] containing the comparison results
939 /// in the low-order bits.
940 static __inline__ __m128 __DEFAULT_FN_ATTRS
941 _mm_cmpord_ss(__m128 __a, __m128 __b)
942 {
943  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
944 }
945 
946 /// \brief Compares each of the corresponding 32-bit float values of the
947 /// 128-bit vectors of [4 x float] to determine if the values in the first
948 /// operand are ordered with respect to those in the second operand.
949 ///
950 /// \headerfile <x86intrin.h>
951 ///
952 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
953 /// instructions.
954 ///
955 /// \param __a
956 /// A 128-bit vector of [4 x float].
957 /// \param __b
958 /// A 128-bit vector of [4 x float].
959 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
960 static __inline__ __m128 __DEFAULT_FN_ATTRS
961 _mm_cmpord_ps(__m128 __a, __m128 __b)
962 {
963  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
964 }
965 
966 /// \brief Compares two 32-bit float values in the low-order bits of both
967 /// operands to determine if the value in the first operand is unordered
968 /// with respect to the corresponding value in the second operand and
969 /// returns the result of the comparison in the low-order bits of a vector
970 /// of [4 x float].
971 ///
972 /// \headerfile <x86intrin.h>
973 ///
974 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
975 /// instructions.
976 ///
977 /// \param __a
978 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
979 /// 32 bits of this operand are used in the comparison.
980 /// \param __b
981 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
982 /// 32 bits of this operand are used in the comparison.
983 /// \returns A 128-bit vector of [4 x float] containing the comparison results
984 /// in the low-order bits.
985 static __inline__ __m128 __DEFAULT_FN_ATTRS
986 _mm_cmpunord_ss(__m128 __a, __m128 __b)
987 {
988  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
989 }
990 
991 /// \brief Compares each of the corresponding 32-bit float values of the
992 /// 128-bit vectors of [4 x float] to determine if the values in the first
993 /// operand are unordered with respect to those in the second operand.
994 ///
995 /// \headerfile <x86intrin.h>
996 ///
997 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
998 /// instructions.
999 ///
1000 /// \param __a
1001 /// A 128-bit vector of [4 x float].
1002 /// \param __b
1003 /// A 128-bit vector of [4 x float].
1004 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1005 static __inline__ __m128 __DEFAULT_FN_ATTRS
1006 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1007 {
1008  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1009 }
1010 
1011 /// \brief Compares two 32-bit float values in the low-order bits of both
1012 /// operands for equality and returns the result of the comparison.
1013 ///
1014 /// \headerfile <x86intrin.h>
1015 ///
1016 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1017 /// instructions.
1018 ///
1019 /// \param __a
1020 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1021 /// used in the comparison.
1022 /// \param __b
1023 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1024 /// used in the comparison.
1025 /// \returns An integer containing the comparison results.
1026 static __inline__ int __DEFAULT_FN_ATTRS
1027 _mm_comieq_ss(__m128 __a, __m128 __b)
1028 {
1029  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1030 }
1031 
1032 /// \brief Compares two 32-bit float values in the low-order bits of both
1033 /// operands to determine if the first operand is less than the second
1034 /// operand and returns the result of the comparison.
1035 ///
1036 /// \headerfile <x86intrin.h>
1037 ///
1038 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1039 /// instructions.
1040 ///
1041 /// \param __a
1042 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1043 /// used in the comparison.
1044 /// \param __b
1045 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1046 /// used in the comparison.
1047 /// \returns An integer containing the comparison results.
1048 static __inline__ int __DEFAULT_FN_ATTRS
1049 _mm_comilt_ss(__m128 __a, __m128 __b)
1050 {
1051  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1052 }
1053 
1054 /// \brief Compares two 32-bit float values in the low-order bits of both
1055 /// operands to determine if the first operand is less than or equal to the
1056 /// second operand and returns the result of the comparison.
1057 ///
1058 /// \headerfile <x86intrin.h>
1059 ///
1060 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1061 ///
1062 /// \param __a
1063 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1064 /// used in the comparison.
1065 /// \param __b
1066 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1067 /// used in the comparison.
1068 /// \returns An integer containing the comparison results.
1069 static __inline__ int __DEFAULT_FN_ATTRS
1070 _mm_comile_ss(__m128 __a, __m128 __b)
1071 {
1072  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1073 }
1074 
1075 /// \brief Compares two 32-bit float values in the low-order bits of both
1076 /// operands to determine if the first operand is greater than the second
1077 /// operand and returns the result of the comparison.
1078 ///
1079 /// \headerfile <x86intrin.h>
1080 ///
1081 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1082 ///
1083 /// \param __a
1084 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085 /// used in the comparison.
1086 /// \param __b
1087 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1088 /// used in the comparison.
1089 /// \returns An integer containing the comparison results.
1090 static __inline__ int __DEFAULT_FN_ATTRS
1091 _mm_comigt_ss(__m128 __a, __m128 __b)
1092 {
1093  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1094 }
1095 
1096 /// \brief Compares two 32-bit float values in the low-order bits of both
1097 /// operands to determine if the first operand is greater than or equal to
1098 /// the second operand and returns the result of the comparison.
1099 ///
1100 /// \headerfile <x86intrin.h>
1101 ///
1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1103 ///
1104 /// \param __a
1105 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106 /// used in the comparison.
1107 /// \param __b
1108 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109 /// used in the comparison.
1110 /// \returns An integer containing the comparison results.
1111 static __inline__ int __DEFAULT_FN_ATTRS
1112 _mm_comige_ss(__m128 __a, __m128 __b)
1113 {
1114  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1115 }
1116 
1117 /// \brief Compares two 32-bit float values in the low-order bits of both
1118 /// operands to determine if the first operand is not equal to the second
1119 /// operand and returns the result of the comparison.
1120 ///
1121 /// \headerfile <x86intrin.h>
1122 ///
1123 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1124 ///
1125 /// \param __a
1126 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1127 /// used in the comparison.
1128 /// \param __b
1129 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130 /// used in the comparison.
1131 /// \returns An integer containing the comparison results.
1132 static __inline__ int __DEFAULT_FN_ATTRS
1133 _mm_comineq_ss(__m128 __a, __m128 __b)
1134 {
1135  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1136 }
1137 
1138 /// \brief Performs an unordered comparison of two 32-bit float values using
1139 /// the low-order bits of both operands to determine equality and returns
1140 /// the result of the comparison.
1141 ///
1142 /// \headerfile <x86intrin.h>
1143 ///
1144 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1145 ///
1146 /// \param __a
1147 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1148 /// used in the comparison.
1149 /// \param __b
1150 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1151 /// used in the comparison.
1152 /// \returns An integer containing the comparison results.
1153 static __inline__ int __DEFAULT_FN_ATTRS
1154 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1155 {
1156  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1157 }
1158 
1159 /// \brief Performs an unordered comparison of two 32-bit float values using
1160 /// the low-order bits of both operands to determine if the first operand is
1161 /// less than the second operand and returns the result of the comparison.
1162 ///
1163 /// \headerfile <x86intrin.h>
1164 ///
1165 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1166 ///
1167 /// \param __a
1168 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1169 /// used in the comparison.
1170 /// \param __b
1171 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1172 /// used in the comparison.
1173 /// \returns An integer containing the comparison results.
1174 static __inline__ int __DEFAULT_FN_ATTRS
1175 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1176 {
1177  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1178 }
1179 
1180 /// \brief Performs an unordered comparison of two 32-bit float values using
1181 /// the low-order bits of both operands to determine if the first operand is
1182 /// less than or equal to the second operand and returns the result of the
1183 /// comparison.
1184 ///
1185 /// \headerfile <x86intrin.h>
1186 ///
1187 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1188 ///
1189 /// \param __a
1190 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1191 /// used in the comparison.
1192 /// \param __b
1193 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1194 /// used in the comparison.
1195 /// \returns An integer containing the comparison results.
1196 static __inline__ int __DEFAULT_FN_ATTRS
1197 _mm_ucomile_ss(__m128 __a, __m128 __b)
1198 {
1199  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1200 }
1201 
1202 /// \brief Performs an unordered comparison of two 32-bit float values using
1203 /// the low-order bits of both operands to determine if the first operand is
1204 /// greater than the second operand and returns the result of the
1205 /// comparison.
1206 ///
1207 /// \headerfile <x86intrin.h>
1208 ///
1209 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1210 ///
1211 /// \param __a
1212 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1213 /// used in the comparison.
1214 /// \param __b
1215 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1216 /// used in the comparison.
1217 /// \returns An integer containing the comparison results.
1218 static __inline__ int __DEFAULT_FN_ATTRS
1219 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1220 {
1221  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1222 }
1223 
1224 /// \brief Performs an unordered comparison of two 32-bit float values using
1225 /// the low-order bits of both operands to determine if the first operand is
1226 /// greater than or equal to the second operand and returns the result of
1227 /// the comparison.
1228 ///
1229 /// \headerfile <x86intrin.h>
1230 ///
1231 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1232 ///
1233 /// \param __a
1234 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1235 /// used in the comparison.
1236 /// \param __b
1237 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1238 /// used in the comparison.
1239 /// \returns An integer containing the comparison results.
1240 static __inline__ int __DEFAULT_FN_ATTRS
1241 _mm_ucomige_ss(__m128 __a, __m128 __b)
1242 {
1243  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1244 }
1245 
1246 /// \brief Performs an unordered comparison of two 32-bit float values using
1247 /// the low-order bits of both operands to determine inequality and returns
1248 /// the result of the comparison.
1249 ///
1250 /// \headerfile <x86intrin.h>
1251 ///
1252 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1253 ///
1254 /// \param __a
1255 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256 /// used in the comparison.
1257 /// \param __b
1258 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1259 /// used in the comparison.
1260 /// \returns An integer containing the comparison results.
1261 static __inline__ int __DEFAULT_FN_ATTRS
1262 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1263 {
1264  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1265 }
1266 
1267 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1268 /// [4 x float] into a 32-bit integer.
1269 ///
1270 /// \headerfile <x86intrin.h>
1271 ///
1272 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1273 /// instructions.
1274 ///
1275 /// \param __a
1276 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 /// used in the conversion.
1278 /// \returns A 32-bit integer containing the converted value.
1279 static __inline__ int __DEFAULT_FN_ATTRS
1280 _mm_cvtss_si32(__m128 __a)
1281 {
1282  return __builtin_ia32_cvtss2si((__v4sf)__a);
1283 }
1284 
1285 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1286 /// [4 x float] into a 32-bit integer.
1287 ///
1288 /// \headerfile <x86intrin.h>
1289 ///
1290 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1291 /// instructions.
1292 ///
1293 /// \param __a
1294 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1295 /// used in the conversion.
1296 /// \returns A 32-bit integer containing the converted value.
1297 static __inline__ int __DEFAULT_FN_ATTRS
1298 _mm_cvt_ss2si(__m128 __a)
1299 {
1300  return _mm_cvtss_si32(__a);
1301 }
1302 
1303 #ifdef __x86_64__
1304 
1305 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1306 /// [4 x float] into a 64-bit integer.
1307 ///
1308 /// \headerfile <x86intrin.h>
1309 ///
1310 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1311 /// instructions.
1312 ///
1313 /// \param __a
1314 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1315 /// used in the conversion.
1316 /// \returns A 64-bit integer containing the converted value.
1317 static __inline__ long long __DEFAULT_FN_ATTRS
1318 _mm_cvtss_si64(__m128 __a)
1319 {
1320  return __builtin_ia32_cvtss2si64((__v4sf)__a);
1321 }
1322 
1323 #endif
1324 
1325 /// \brief Converts two low-order float values in a 128-bit vector of
1326 /// [4 x float] into a 64-bit vector of [2 x i32].
1327 ///
1328 /// \headerfile <x86intrin.h>
1329 ///
1330 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1331 ///
1332 /// \param __a
1333 /// A 128-bit vector of [4 x float].
1334 /// \returns A 64-bit integer vector containing the converted values.
1335 static __inline__ __m64 __DEFAULT_FN_ATTRS
1336 _mm_cvtps_pi32(__m128 __a)
1337 {
1338  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1339 }
1340 
1341 /// \brief Converts two low-order float values in a 128-bit vector of
1342 /// [4 x float] into a 64-bit vector of [2 x i32].
1343 ///
1344 /// \headerfile <x86intrin.h>
1345 ///
1346 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1347 ///
1348 /// \param __a
1349 /// A 128-bit vector of [4 x float].
1350 /// \returns A 64-bit integer vector containing the converted values.
1351 static __inline__ __m64 __DEFAULT_FN_ATTRS
1352 _mm_cvt_ps2pi(__m128 __a)
1353 {
1354  return _mm_cvtps_pi32(__a);
1355 }
1356 
1357 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1358 /// [4 x float] into a 32-bit integer, truncating the result when it is
1359 /// inexact.
1360 ///
1361 /// \headerfile <x86intrin.h>
1362 ///
1363 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1364 /// instructions.
1365 ///
1366 /// \param __a
1367 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1368 /// used in the conversion.
1369 /// \returns A 32-bit integer containing the converted value.
1370 static __inline__ int __DEFAULT_FN_ATTRS
1371 _mm_cvttss_si32(__m128 __a)
1372 {
1373  return __builtin_ia32_cvttss2si((__v4sf)__a);
1374 }
1375 
1376 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1377 /// [4 x float] into a 32-bit integer, truncating the result when it is
1378 /// inexact.
1379 ///
1380 /// \headerfile <x86intrin.h>
1381 ///
1382 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1383 /// instructions.
1384 ///
1385 /// \param __a
1386 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1387 /// used in the conversion.
1388 /// \returns A 32-bit integer containing the converted value.
1389 static __inline__ int __DEFAULT_FN_ATTRS
1390 _mm_cvtt_ss2si(__m128 __a)
1391 {
1392  return _mm_cvttss_si32(__a);
1393 }
1394 
1395 #ifdef __x86_64__
1396 /// \brief Converts a float value contained in the lower 32 bits of a vector of
1397 /// [4 x float] into a 64-bit integer, truncating the result when it is
1398 /// inexact.
1399 ///
1400 /// \headerfile <x86intrin.h>
1401 ///
1402 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1403 /// instructions.
1404 ///
1405 /// \param __a
1406 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1407 /// used in the conversion.
1408 /// \returns A 64-bit integer containing the converted value.
1409 static __inline__ long long __DEFAULT_FN_ATTRS
1410 _mm_cvttss_si64(__m128 __a)
1411 {
1412  return __builtin_ia32_cvttss2si64((__v4sf)__a);
1413 }
1414 #endif
1415 
1416 /// \brief Converts two low-order float values in a 128-bit vector of
1417 /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1418 /// when it is inexact.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1423 /// instructions.
1424 ///
1425 /// \param __a
1426 /// A 128-bit vector of [4 x float].
1427 /// \returns A 64-bit integer vector containing the converted values.
1428 static __inline__ __m64 __DEFAULT_FN_ATTRS
1429 _mm_cvttps_pi32(__m128 __a)
1430 {
1431  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1432 }
1433 
1434 /// \brief Converts two low-order float values in a 128-bit vector of [4 x
1435 /// float] into a 64-bit vector of [2 x i32], truncating the result when it
1436 /// is inexact.
1437 ///
1438 /// \headerfile <x86intrin.h>
1439 ///
1440 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1441 ///
1442 /// \param __a
1443 /// A 128-bit vector of [4 x float].
1444 /// \returns A 64-bit integer vector containing the converted values.
1445 static __inline__ __m64 __DEFAULT_FN_ATTRS
1446 _mm_cvtt_ps2pi(__m128 __a)
1447 {
1448  return _mm_cvttps_pi32(__a);
1449 }
1450 
1451 /// \brief Converts a 32-bit signed integer value into a floating point value
1452 /// and writes it to the lower 32 bits of the destination. The remaining
1453 /// higher order elements of the destination vector are copied from the
1454 /// corresponding elements in the first operand.
1455 ///
1456 /// \headerfile <x86intrin.h>
1457 ///
1458 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1459 ///
1460 /// \param __a
1461 /// A 128-bit vector of [4 x float].
1462 /// \param __b
1463 /// A 32-bit signed integer operand containing the value to be converted.
1464 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1465 /// converted value of the second operand. The upper 96 bits are copied from
1466 /// the upper 96 bits of the first operand.
1467 static __inline__ __m128 __DEFAULT_FN_ATTRS
1468 _mm_cvtsi32_ss(__m128 __a, int __b)
1469 {
1470  __a[0] = __b;
1471  return __a;
1472 }
1473 
1474 /// \brief Converts a 32-bit signed integer value into a floating point value
1475 /// and writes it to the lower 32 bits of the destination. The remaining
1476 /// higher order elements of the destination are copied from the
1477 /// corresponding elements in the first operand.
1478 ///
1479 /// \headerfile <x86intrin.h>
1480 ///
1481 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1482 ///
1483 /// \param __a
1484 /// A 128-bit vector of [4 x float].
1485 /// \param __b
1486 /// A 32-bit signed integer operand containing the value to be converted.
1487 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1488 /// converted value of the second operand. The upper 96 bits are copied from
1489 /// the upper 96 bits of the first operand.
1490 static __inline__ __m128 __DEFAULT_FN_ATTRS
1491 _mm_cvt_si2ss(__m128 __a, int __b)
1492 {
1493  return _mm_cvtsi32_ss(__a, __b);
1494 }
1495 
1496 #ifdef __x86_64__
1497 
1498 /// \brief Converts a 64-bit signed integer value into a floating point value
1499 /// and writes it to the lower 32 bits of the destination. The remaining
1500 /// higher order elements of the destination are copied from the
1501 /// corresponding elements in the first operand.
1502 ///
1503 /// \headerfile <x86intrin.h>
1504 ///
1505 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1506 ///
1507 /// \param __a
1508 /// A 128-bit vector of [4 x float].
1509 /// \param __b
1510 /// A 64-bit signed integer operand containing the value to be converted.
1511 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1512 /// converted value of the second operand. The upper 96 bits are copied from
1513 /// the upper 96 bits of the first operand.
1514 static __inline__ __m128 __DEFAULT_FN_ATTRS
1515 _mm_cvtsi64_ss(__m128 __a, long long __b)
1516 {
1517  __a[0] = __b;
1518  return __a;
1519 }
1520 
1521 #endif
1522 
1523 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1524 /// floating point values and writes them to the lower 64-bits of the
1525 /// destination. The remaining higher order elements of the destination are
1526 /// copied from the corresponding elements in the first operand.
1527 ///
1528 /// \headerfile <x86intrin.h>
1529 ///
1530 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1531 ///
1532 /// \param __a
1533 /// A 128-bit vector of [4 x float].
1534 /// \param __b
1535 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1536 /// and written to the corresponding low-order elements in the destination.
1537 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1538 /// converted value of the second operand. The upper 64 bits are copied from
1539 /// the upper 64 bits of the first operand.
1540 static __inline__ __m128 __DEFAULT_FN_ATTRS
1541 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1542 {
1543  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1544 }
1545 
1546 /// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
1547 /// floating point values and writes them to the lower 64-bits of the
1548 /// destination. The remaining higher order elements of the destination are
1549 /// copied from the corresponding elements in the first operand.
1550 ///
1551 /// \headerfile <x86intrin.h>
1552 ///
1553 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1554 ///
1555 /// \param __a
1556 /// A 128-bit vector of [4 x float].
1557 /// \param __b
1558 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1559 /// and written to the corresponding low-order elements in the destination.
1560 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1561 /// converted value from the second operand. The upper 64 bits are copied
1562 /// from the upper 64 bits of the first operand.
1563 static __inline__ __m128 __DEFAULT_FN_ATTRS
1564 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1565 {
1566  return _mm_cvtpi32_ps(__a, __b);
1567 }
1568 
1569 /// \brief Extracts a float value contained in the lower 32 bits of a vector of
1570 /// [4 x float].
1571 ///
1572 /// \headerfile <x86intrin.h>
1573 ///
1574 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1575 ///
1576 /// \param __a
1577 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1578 /// used in the extraction.
1579 /// \returns A 32-bit float containing the extracted value.
1580 static __inline__ float __DEFAULT_FN_ATTRS
1581 _mm_cvtss_f32(__m128 __a)
1582 {
1583  return __a[0];
1584 }
1585 
1586 /// \brief Loads two packed float values from the address \a __p into the
1587 /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1588 /// are copied from the low-order bits of the first operand.
1589 ///
1590 /// \headerfile <x86intrin.h>
1591 ///
1592 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1593 ///
1594 /// \param __a
1595 /// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1596 /// of the destination.
1597 /// \param __p
1598 /// A pointer to two packed float values. Bits [63:0] are written to bits
1599 /// [127:64] of the destination.
1600 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1601 static __inline__ __m128 __DEFAULT_FN_ATTRS
1602 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1603 {
1604  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1605  struct __mm_loadh_pi_struct {
1606  __mm_loadh_pi_v2f32 __u;
1607  } __attribute__((__packed__, __may_alias__));
1608  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1609  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1610  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1611 }
1612 
1613 /// \brief Loads two packed float values from the address \a __p into the
1614 /// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1615 /// are copied from the high-order bits of the first operand.
1616 ///
1617 /// \headerfile <x86intrin.h>
1618 ///
1619 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1620 ///
1621 /// \param __a
1622 /// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1623 /// [127:64] of the destination.
1624 /// \param __p
1625 /// A pointer to two packed float values. Bits [63:0] are written to bits
1626 /// [63:0] of the destination.
1627 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1628 static __inline__ __m128 __DEFAULT_FN_ATTRS
1629 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1630 {
1631  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1632  struct __mm_loadl_pi_struct {
1633  __mm_loadl_pi_v2f32 __u;
1634  } __attribute__((__packed__, __may_alias__));
1635  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1636  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1637  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1638 }
1639 
1640 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1641 /// 32 bits of the vector are initialized with the single-precision
1642 /// floating-point value loaded from a specified memory location. The upper
1643 /// 96 bits are set to zero.
1644 ///
1645 /// \headerfile <x86intrin.h>
1646 ///
1647 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1648 ///
1649 /// \param __p
1650 /// A pointer to a 32-bit memory location containing a single-precision
1651 /// floating-point value.
1652 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1653 /// lower 32 bits contain the value loaded from the memory location. The
1654 /// upper 96 bits are set to zero.
1655 static __inline__ __m128 __DEFAULT_FN_ATTRS
1656 _mm_load_ss(const float *__p)
1657 {
1658  struct __mm_load_ss_struct {
1659  float __u;
1660  } __attribute__((__packed__, __may_alias__));
1661  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1662  return (__m128){ __u, 0, 0, 0 };
1663 }
1664 
1665 /// \brief Loads a 32-bit float value and duplicates it to all four vector
1666 /// elements of a 128-bit vector of [4 x float].
1667 ///
1668 /// \headerfile <x86intrin.h>
1669 ///
1670 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
1671 /// instruction.
1672 ///
1673 /// \param __p
1674 /// A pointer to a float value to be loaded and duplicated.
1675 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1676 /// duplicated values.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS
1678 _mm_load1_ps(const float *__p)
1679 {
1680  struct __mm_load1_ps_struct {
1681  float __u;
1682  } __attribute__((__packed__, __may_alias__));
1683  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1684  return (__m128){ __u, __u, __u, __u };
1685 }
1686 
1687 #define _mm_load_ps1(p) _mm_load1_ps(p)
1688 
1689 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
1690 /// memory location.
1691 ///
1692 /// \headerfile <x86intrin.h>
1693 ///
1694 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1695 ///
1696 /// \param __p
1697 /// A pointer to a 128-bit memory location. The address of the memory
1698 /// location has to be 128-bit aligned.
1699 /// \returns A 128-bit vector of [4 x float] containing the loaded valus.
1700 static __inline__ __m128 __DEFAULT_FN_ATTRS
1701 _mm_load_ps(const float *__p)
1702 {
1703  return *(__m128*)__p;
1704 }
1705 
1706 /// \brief Loads a 128-bit floating-point vector of [4 x float] from an
1707 /// unaligned memory location.
1708 ///
1709 /// \headerfile <x86intrin.h>
1710 ///
1711 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1712 ///
1713 /// \param __p
1714 /// A pointer to a 128-bit memory location. The address of the memory
1715 /// location does not have to be aligned.
1716 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1717 static __inline__ __m128 __DEFAULT_FN_ATTRS
1718 _mm_loadu_ps(const float *__p)
1719 {
1720  struct __loadu_ps {
1721  __m128 __v;
1722  } __attribute__((__packed__, __may_alias__));
1723  return ((struct __loadu_ps*)__p)->__v;
1724 }
1725 
1726 /// \brief Loads four packed float values, in reverse order, from an aligned
1727 /// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1728 ///
1729 /// \headerfile <x86intrin.h>
1730 ///
1731 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1732 /// instruction.
1733 ///
1734 /// \param __p
1735 /// A pointer to a 128-bit memory location. The address of the memory
1736 /// location has to be 128-bit aligned.
1737 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1738 /// in reverse order.
1739 static __inline__ __m128 __DEFAULT_FN_ATTRS
1740 _mm_loadr_ps(const float *__p)
1741 {
1742  __m128 __a = _mm_load_ps(__p);
1743  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1744 }
1745 
1746 /// \brief Create a 128-bit vector of [4 x float] with undefined values.
1747 ///
1748 /// \headerfile <x86intrin.h>
1749 ///
1750 /// This intrinsic has no corresponding instruction.
1751 ///
1752 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1753 static __inline__ __m128 __DEFAULT_FN_ATTRS
1755 {
1756  return (__m128)__builtin_ia32_undef128();
1757 }
1758 
1759 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
1760 /// 32 bits of the vector are initialized with the specified single-precision
1761 /// floating-point value. The upper 96 bits are set to zero.
1762 ///
1763 /// \headerfile <x86intrin.h>
1764 ///
1765 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1766 ///
1767 /// \param __w
1768 /// A single-precision floating-point value used to initialize the lower 32
1769 /// bits of the result.
1770 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1771 /// lower 32 bits contain the value provided in the source operand. The
1772 /// upper 96 bits are set to zero.
1773 static __inline__ __m128 __DEFAULT_FN_ATTRS
1774 _mm_set_ss(float __w)
1775 {
1776  return (__m128){ __w, 0, 0, 0 };
1777 }
1778 
1779 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1780 /// of the four single-precision floating-point vector elements set to the
1781 /// specified single-precision floating-point value.
1782 ///
1783 /// \headerfile <x86intrin.h>
1784 ///
1785 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1786 ///
1787 /// \param __w
1788 /// A single-precision floating-point value used to initialize each vector
1789 /// element of the result.
1790 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1791 static __inline__ __m128 __DEFAULT_FN_ATTRS
1792 _mm_set1_ps(float __w)
1793 {
1794  return (__m128){ __w, __w, __w, __w };
1795 }
1796 
1797 /* Microsoft specific. */
1798 /// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
1799 /// of the four single-precision floating-point vector elements set to the
1800 /// specified single-precision floating-point value.
1801 ///
1802 /// \headerfile <x86intrin.h>
1803 ///
1804 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1805 ///
1806 /// \param __w
1807 /// A single-precision floating-point value used to initialize each vector
1808 /// element of the result.
1809 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1810 static __inline__ __m128 __DEFAULT_FN_ATTRS
1811 _mm_set_ps1(float __w)
1812 {
1813  return _mm_set1_ps(__w);
1814 }
1815 
1816 /// \brief Constructs a 128-bit floating-point vector of [4 x float]
1817 /// initialized with the specified single-precision floating-point values.
1818 ///
1819 /// \headerfile <x86intrin.h>
1820 ///
1821 /// This intrinsic is a utility function and does not correspond to a specific
1822 /// instruction.
1823 ///
1824 /// \param __z
1825 /// A single-precision floating-point value used to initialize bits [127:96]
1826 /// of the result.
1827 /// \param __y
1828 /// A single-precision floating-point value used to initialize bits [95:64]
1829 /// of the result.
1830 /// \param __x
1831 /// A single-precision floating-point value used to initialize bits [63:32]
1832 /// of the result.
1833 /// \param __w
1834 /// A single-precision floating-point value used to initialize bits [31:0]
1835 /// of the result.
1836 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1837 static __inline__ __m128 __DEFAULT_FN_ATTRS
1838 _mm_set_ps(float __z, float __y, float __x, float __w)
1839 {
1840  return (__m128){ __w, __x, __y, __z };
1841 }
1842 
1843 /// \brief Constructs a 128-bit floating-point vector of [4 x float],
1844 /// initialized in reverse order with the specified 32-bit single-precision
1845 /// float-point values.
1846 ///
1847 /// \headerfile <x86intrin.h>
1848 ///
1849 /// This intrinsic is a utility function and does not correspond to a specific
1850 /// instruction.
1851 ///
1852 /// \param __z
1853 /// A single-precision floating-point value used to initialize bits [31:0]
1854 /// of the result.
1855 /// \param __y
1856 /// A single-precision floating-point value used to initialize bits [63:32]
1857 /// of the result.
1858 /// \param __x
1859 /// A single-precision floating-point value used to initialize bits [95:64]
1860 /// of the result.
1861 /// \param __w
1862 /// A single-precision floating-point value used to initialize bits [127:96]
1863 /// of the result.
1864 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1865 static __inline__ __m128 __DEFAULT_FN_ATTRS
1866 _mm_setr_ps(float __z, float __y, float __x, float __w)
1867 {
1868  return (__m128){ __z, __y, __x, __w };
1869 }
1870 
1871 /// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
1872 /// to zero.
1873 ///
1874 /// \headerfile <x86intrin.h>
1875 ///
1876 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1877 ///
1878 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1879 /// all elements set to zero.
1880 static __inline__ __m128 __DEFAULT_FN_ATTRS
1882 {
1883  return (__m128){ 0, 0, 0, 0 };
1884 }
1885 
1886 /// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1887 /// memory location.
1888 ///
1889 /// \headerfile <x86intrin.h>
1890 ///
1891 /// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
1892 ///
1893 /// \param __p
1894 /// A pointer to a 64-bit memory location.
1895 /// \param __a
1896 /// A 128-bit vector of [4 x float] containing the values to be stored.
1897 static __inline__ void __DEFAULT_FN_ATTRS
1898 _mm_storeh_pi(__m64 *__p, __m128 __a)
1899 {
1900  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
1901 }
1902 
1903 /// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1904 /// memory location.
1905 ///
1906 /// \headerfile <x86intrin.h>
1907 ///
1908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1909 ///
1910 /// \param __p
1911 /// A pointer to a memory location that will receive the float values.
1912 /// \param __a
1913 /// A 128-bit vector of [4 x float] containing the values to be stored.
1914 static __inline__ void __DEFAULT_FN_ATTRS
1915 _mm_storel_pi(__m64 *__p, __m128 __a)
1916 {
1917  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
1918 }
1919 
1920 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1921 /// memory location.
1922 ///
1923 /// \headerfile <x86intrin.h>
1924 ///
1925 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1926 ///
1927 /// \param __p
1928 /// A pointer to a 32-bit memory location.
1929 /// \param __a
1930 /// A 128-bit vector of [4 x float] containing the value to be stored.
1931 static __inline__ void __DEFAULT_FN_ATTRS
1932 _mm_store_ss(float *__p, __m128 __a)
1933 {
1934  struct __mm_store_ss_struct {
1935  float __u;
1936  } __attribute__((__packed__, __may_alias__));
1937  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1938 }
1939 
1940 /// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory
1941 /// location.
1942 ///
1943 /// \headerfile <x86intrin.h>
1944 ///
1945 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1946 ///
1947 /// \param __p
1948 /// A pointer to a 128-bit memory location. The address of the memory
1949 /// location does not have to be aligned.
1950 /// \param __a
1951 /// A 128-bit vector of [4 x float] containing the values to be stored.
1952 static __inline__ void __DEFAULT_FN_ATTRS
1953 _mm_storeu_ps(float *__p, __m128 __a)
1954 {
1955  struct __storeu_ps {
1956  __m128 __v;
1957  } __attribute__((__packed__, __may_alias__));
1958  ((struct __storeu_ps*)__p)->__v = __a;
1959 }
1960 
1961 /// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
1962 /// location.
1963 ///
1964 /// \headerfile <x86intrin.h>
1965 ///
1966 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1967 ///
1968 /// \param __p
1969 /// A pointer to a 128-bit memory location. The address of the memory
1970 /// location has to be 16-byte aligned.
1971 /// \param __a
1972 /// A 128-bit vector of [4 x float] containing the values to be stored.
1973 static __inline__ void __DEFAULT_FN_ATTRS
1974 _mm_store_ps(float *__p, __m128 __a)
1975 {
1976  *(__m128*)__p = __a;
1977 }
1978 
1979 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
1980 /// four contiguous elements in an aligned memory location.
1981 ///
1982 /// \headerfile <x86intrin.h>
1983 ///
1984 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
1985 /// instruction.
1986 ///
1987 /// \param __p
1988 /// A pointer to a 128-bit memory location.
1989 /// \param __a
1990 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
1991 /// of the four contiguous elements pointed by \a __p.
1992 static __inline__ void __DEFAULT_FN_ATTRS
1993 _mm_store1_ps(float *__p, __m128 __a)
1994 {
1995  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
1996  _mm_store_ps(__p, __a);
1997 }
1998 
1999 /// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2000 /// four contiguous elements in an aligned memory location.
2001 ///
2002 /// \headerfile <x86intrin.h>
2003 ///
2004 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2005 /// instruction.
2006 ///
2007 /// \param __p
2008 /// A pointer to a 128-bit memory location.
2009 /// \param __a
2010 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2011 /// of the four contiguous elements pointed by \a __p.
2012 static __inline__ void __DEFAULT_FN_ATTRS
2013 _mm_store_ps1(float *__p, __m128 __a)
2014 {
2015  return _mm_store1_ps(__p, __a);
2016 }
2017 
2018 /// \brief Stores float values from a 128-bit vector of [4 x float] to an
2019 /// aligned memory location in reverse order.
2020 ///
2021 /// \headerfile <x86intrin.h>
2022 ///
2023 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2024 /// instruction.
2025 ///
2026 /// \param __p
2027 /// A pointer to a 128-bit memory location. The address of the memory
2028 /// location has to be 128-bit aligned.
2029 /// \param __a
2030 /// A 128-bit vector of [4 x float] containing the values to be stored.
2031 static __inline__ void __DEFAULT_FN_ATTRS
2032 _mm_storer_ps(float *__p, __m128 __a)
2033 {
2034  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2035  _mm_store_ps(__p, __a);
2036 }
2037 
/* Hint constants accepted as the `sel` argument of _mm_prefetch. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2 1
#define _MM_HINT_T1 2
#define _MM_HINT_T0 3
2042 
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// \brief Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated.
/* The cast keeps `const` so pointer-to-const arguments do not lose their
   qualifier; the second argument 0 requests a prefetch for reading. */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), 0, (sel)))
#endif
2073 
2074 /// \brief Stores a 64-bit integer in the specified aligned memory location. To
2075 /// minimize caching, the data is flagged as non-temporal (unlikely to be
2076 /// used again soon).
2077 ///
2078 /// \headerfile <x86intrin.h>
2079 ///
2080 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2081 ///
2082 /// \param __p
2083 /// A pointer to an aligned memory location used to store the register value.
2084 /// \param __a
2085 /// A 64-bit integer containing the value to be stored.
2086 static __inline__ void __DEFAULT_FN_ATTRS
2087 _mm_stream_pi(__m64 *__p, __m64 __a)
2088 {
2089  __builtin_ia32_movntq(__p, __a);
2090 }
2091 
2092 /// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
2093 /// 128-bit aligned memory location. To minimize caching, the data is flagged
2094 /// as non-temporal (unlikely to be used again soon).
2095 ///
2096 /// \headerfile <x86intrin.h>
2097 ///
2098 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2099 ///
2100 /// \param __p
2101 /// A pointer to a 128-bit aligned memory location that will receive the
2102 /// single-precision floating-point values.
2103 /// \param __a
2104 /// A 128-bit vector of [4 x float] containing the values to be moved.
2105 static __inline__ void __DEFAULT_FN_ATTRS
2106 _mm_stream_ps(float *__p, __m128 __a)
2107 {
2108  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2109 }
2110 
#ifdef __cplusplus
extern "C" {
#endif

/// \brief Store fence: forces strong memory ordering (serialization) between
///    the store instructions that precede this instruction and those that
///    follow it, so that the system completes all previous stores before
///    executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2129 
/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Arguments are parenthesized so the casts apply to the whole argument
   expression rather than only to its first operand. */
#define _mm_extract_pi16(a, n) __extension__ ({ \
  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })
2152 
/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: Bits [15:0] of the destination are written. \n
///    1: Bits [31:16] of the destination are written. \n
///    2: Bits [47:32] of the destination are written. \n
///    3: Bits [63:48] of the destination are written. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Arguments are parenthesized so the casts apply to the whole argument
   expression rather than only to its first operand. */
#define _mm_insert_pi16(a, d, n) __extension__ ({ \
  (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
2183 
2184 /// \brief Compares each of the corresponding packed 16-bit integer values of
2185 /// the 64-bit integer vectors, and writes the greater value to the
2186 /// corresponding bits in the destination.
2187 ///
2188 /// \headerfile <x86intrin.h>
2189 ///
2190 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2191 ///
2192 /// \param __a
2193 /// A 64-bit integer vector containing one of the source operands.
2194 /// \param __b
2195 /// A 64-bit integer vector containing one of the source operands.
2196 /// \returns A 64-bit integer vector containing the comparison results.
2197 static __inline__ __m64 __DEFAULT_FN_ATTRS
2198 _mm_max_pi16(__m64 __a, __m64 __b)
2199 {
2200  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2201 }
2202 
2203 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2204 /// values of the 64-bit integer vectors, and writes the greater value to the
2205 /// corresponding bits in the destination.
2206 ///
2207 /// \headerfile <x86intrin.h>
2208 ///
2209 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2210 ///
2211 /// \param __a
2212 /// A 64-bit integer vector containing one of the source operands.
2213 /// \param __b
2214 /// A 64-bit integer vector containing one of the source operands.
2215 /// \returns A 64-bit integer vector containing the comparison results.
2216 static __inline__ __m64 __DEFAULT_FN_ATTRS
2217 _mm_max_pu8(__m64 __a, __m64 __b)
2218 {
2219  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2220 }
2221 
2222 /// \brief Compares each of the corresponding packed 16-bit integer values of
2223 /// the 64-bit integer vectors, and writes the lesser value to the
2224 /// corresponding bits in the destination.
2225 ///
2226 /// \headerfile <x86intrin.h>
2227 ///
2228 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2229 ///
2230 /// \param __a
2231 /// A 64-bit integer vector containing one of the source operands.
2232 /// \param __b
2233 /// A 64-bit integer vector containing one of the source operands.
2234 /// \returns A 64-bit integer vector containing the comparison results.
2235 static __inline__ __m64 __DEFAULT_FN_ATTRS
2236 _mm_min_pi16(__m64 __a, __m64 __b)
2237 {
2238  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2239 }
2240 
2241 /// \brief Compares each of the corresponding packed 8-bit unsigned integer
2242 /// values of the 64-bit integer vectors, and writes the lesser value to the
2243 /// corresponding bits in the destination.
2244 ///
2245 /// \headerfile <x86intrin.h>
2246 ///
2247 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2248 ///
2249 /// \param __a
2250 /// A 64-bit integer vector containing one of the source operands.
2251 /// \param __b
2252 /// A 64-bit integer vector containing one of the source operands.
2253 /// \returns A 64-bit integer vector containing the comparison results.
2254 static __inline__ __m64 __DEFAULT_FN_ATTRS
2255 _mm_min_pu8(__m64 __a, __m64 __b)
2256 {
2257  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2258 }
2259 
2260 /// \brief Takes the most significant bit from each 8-bit element in a 64-bit
2261 /// integer vector to create a 16-bit mask value. Zero-extends the value to
2262 /// 32-bit integer and writes it to the destination.
2263 ///
2264 /// \headerfile <x86intrin.h>
2265 ///
2266 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2267 ///
2268 /// \param __a
2269 /// A 64-bit integer vector containing the values with bits to be extracted.
2270 /// \returns The most significant bit from each 8-bit element in the operand,
2271 /// written to bits [15:0].
2272 static __inline__ int __DEFAULT_FN_ATTRS
2274 {
2275  return __builtin_ia32_pmovmskb((__v8qi)__a);
2276 }
2277 
2278 /// \brief Multiplies packed 16-bit unsigned integer values and writes the
2279 /// high-order 16 bits of each 32-bit product to the corresponding bits in
2280 /// the destination.
2281 ///
2282 /// \headerfile <x86intrin.h>
2283 ///
2284 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2285 ///
2286 /// \param __a
2287 /// A 64-bit integer vector containing one of the source operands.
2288 /// \param __b
2289 /// A 64-bit integer vector containing one of the source operands.
2290 /// \returns A 64-bit integer vector containing the products of both operands.
2291 static __inline__ __m64 __DEFAULT_FN_ATTRS
2292 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2293 {
2294  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2295 }
2296 
/// \brief Rearranges the four 16-bit integers of a 64-bit integer vector
///    according to an immediate selector and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate 8-bit value made of four 2-bit fields, one per destination
///    element: \n
///    Bits [1:0] select the source for bits [15:0] of the destination. \n
///    Bits [3:2] select the source for bits [31:16] of the destination. \n
///    Bits [5:4] select the source for bits [47:32] of the destination. \n
///    Bits [7:6] select the source for bits [63:48] of the destination. \n
///    Meaning of each 2-bit field: \n
///    00: take bits [15:0] of \a a. \n
///    01: take bits [31:16] of \a a. \n
///    10: take bits [47:32] of \a a. \n
///    11: take bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  __extension__ ({ (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
2330 
2331 /// \brief Conditionally copies the values from each 8-bit element in the first
2332 /// 64-bit integer vector operand to the specified memory location, as
2333 /// specified by the most significant bit in the corresponding element in the
2334 /// second 64-bit integer vector operand.
2335 ///
2336 /// To minimize caching, the data is flagged as non-temporal
2337 /// (unlikely to be used again soon).
2338 ///
2339 /// \headerfile <x86intrin.h>
2340 ///
2341 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2342 ///
2343 /// \param __d
2344 /// A 64-bit integer vector containing the values with elements to be copied.
2345 /// \param __n
2346 /// A 64-bit integer vector operand. The most significant bit from each 8-bit
2347 /// element determines whether the corresponding element in operand \a __d
2348 /// is copied. If the most significant bit of a given element is 1, the
2349 /// corresponding element in operand \a __d is copied.
2350 /// \param __p
2351 /// A pointer to a 64-bit memory location that will receive the conditionally
2352 /// copied integer values. The address of the memory location does not have
2353 /// to be aligned.
2354 static __inline__ void __DEFAULT_FN_ATTRS
2355 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2356 {
2357  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2358 }
2359 
2360 /// \brief Computes the rounded averages of the packed unsigned 8-bit integer
2361 /// values and writes the averages to the corresponding bits in the
2362 /// destination.
2363 ///
2364 /// \headerfile <x86intrin.h>
2365 ///
2366 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2367 ///
2368 /// \param __a
2369 /// A 64-bit integer vector containing one of the source operands.
2370 /// \param __b
2371 /// A 64-bit integer vector containing one of the source operands.
2372 /// \returns A 64-bit integer vector containing the averages of both operands.
2373 static __inline__ __m64 __DEFAULT_FN_ATTRS
2374 _mm_avg_pu8(__m64 __a, __m64 __b)
2375 {
2376  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2377 }
2378 
2379 /// \brief Computes the rounded averages of the packed unsigned 16-bit integer
2380 /// values and writes the averages to the corresponding bits in the
2381 /// destination.
2382 ///
2383 /// \headerfile <x86intrin.h>
2384 ///
2385 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2386 ///
2387 /// \param __a
2388 /// A 64-bit integer vector containing one of the source operands.
2389 /// \param __b
2390 /// A 64-bit integer vector containing one of the source operands.
2391 /// \returns A 64-bit integer vector containing the averages of both operands.
2392 static __inline__ __m64 __DEFAULT_FN_ATTRS
2393 _mm_avg_pu16(__m64 __a, __m64 __b)
2394 {
2395  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2396 }
2397 
2398 /// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
2399 /// 64-bit vector operands and computes the absolute value for each of the
2400 /// difference. Then sum of the 8 absolute differences is written to the
2401 /// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2402 ///
2403 /// \headerfile <x86intrin.h>
2404 ///
2405 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2406 ///
2407 /// \param __a
2408 /// A 64-bit integer vector containing one of the source operands.
2409 /// \param __b
2410 /// A 64-bit integer vector containing one of the source operands.
2411 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2412 /// sets of absolute differences between both operands. The upper bits are
2413 /// cleared.
2414 static __inline__ __m64 __DEFAULT_FN_ATTRS
2415 _mm_sad_pu8(__m64 __a, __m64 __b)
2416 {
2417  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2418 }
2419 
2420 #if defined(__cplusplus)
2421 extern "C" {
2422 #endif
2423 
/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE(); it takes no argument, and its result is
///      compared against one of these macros.
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the expression below checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following example gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);
2471 
/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, when the rounding-control bits currently hold
///    _MM_ROUND_NEAREST, the following expression causes subsequent
///    floating-point operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP);
///    \endcode
///    That expression only ORs bits into the register. If the current
///    rounding mode is not known, use _MM_SET_ROUNDING_MODE(_MM_ROUND_UP)
///    instead, which clears the rounding-control bits before setting them.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);
2523 
2524 #if defined(__cplusplus)
2525 } // extern "C"
2526 #endif
2527 
/// \brief Builds a 128-bit vector of [4 x float] by picking two elements from
///    each of the two 128-bit operands of [4 x float], as directed by the
///    immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate 8-bit value made of four 2-bit selectors. \n
///    Bits [3:0] select the elements copied from operand \a a. \n
///    Bits [7:4] select the elements copied from operand \a b. \n
///    Each selector fills one 32-bit element of the destination: \n
///    Bits [1:0] choose the value for bits [31:0] of the destination. \n
///    Bits [3:2] choose the value for bits [63:32] of the destination. \n
///    Bits [5:4] choose the value for bits [95:64] of the destination. \n
///    Bits [7:6] choose the value for bits [127:96] of the destination. \n
///    Selector encoding: \n
///    00: Bits [31:0] of the selected operand. \n
///    01: Bits [63:32] of the selected operand. \n
///    10: Bits [95:64] of the selected operand. \n
///    11: Bits [127:96] of the selected operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m128)__builtin_shufflevector( \
      (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
      ((mask) & 0x3), \
      (((mask) >> 2) & 0x3), \
      ((((mask) >> 4) & 0x3) + 4), \
      ((((mask) >> 6) & 0x3) + 4)); })
2570 
2571 /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2572 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2573 ///
2574 /// \headerfile <x86intrin.h>
2575 ///
2576 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2577 ///
2578 /// \param __a
2579 /// A 128-bit vector of [4 x float]. \n
2580 /// Bits [95:64] are written to bits [31:0] of the destination. \n
2581 /// Bits [127:96] are written to bits [95:64] of the destination.
2582 /// \param __b
2583 /// A 128-bit vector of [4 x float].
2584 /// Bits [95:64] are written to bits [63:32] of the destination. \n
2585 /// Bits [127:96] are written to bits [127:96] of the destination.
2586 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2587 static __inline__ __m128 __DEFAULT_FN_ATTRS
2588 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2589 {
2590  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2591 }
2592 
2593 /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2594 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2595 ///
2596 /// \headerfile <x86intrin.h>
2597 ///
2598 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2599 ///
2600 /// \param __a
2601 /// A 128-bit vector of [4 x float]. \n
2602 /// Bits [31:0] are written to bits [31:0] of the destination. \n
2603 /// Bits [63:32] are written to bits [95:64] of the destination.
2604 /// \param __b
2605 /// A 128-bit vector of [4 x float]. \n
2606 /// Bits [31:0] are written to bits [63:32] of the destination. \n
2607 /// Bits [63:32] are written to bits [127:96] of the destination.
2608 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2609 static __inline__ __m128 __DEFAULT_FN_ATTRS
2610 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2611 {
2612  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2613 }
2614 
2615 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2616 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2617 /// 96 bits are set to the upper 96 bits of the first parameter.
2618 ///
2619 /// \headerfile <x86intrin.h>
2620 ///
2621 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2622 ///
2623 /// \param __a
2624 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2625 /// written to the upper 96 bits of the result.
2626 /// \param __b
2627 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2628 /// written to the lower 32 bits of the result.
2629 /// \returns A 128-bit floating-point vector of [4 x float].
2630 static __inline__ __m128 __DEFAULT_FN_ATTRS
2631 _mm_move_ss(__m128 __a, __m128 __b)
2632 {
2633  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
2634 }
2635 
2636 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2637 /// 64 bits are set to the upper 64 bits of the second parameter. The upper
2638 /// 64 bits are set to the upper 64 bits of the first parameter.
2639 ///
2640 /// \headerfile <x86intrin.h>
2641 ///
2642 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2643 ///
2644 /// \param __a
2645 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2646 /// written to the upper 64 bits of the result.
2647 /// \param __b
2648 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2649 /// written to the lower 64 bits of the result.
2650 /// \returns A 128-bit floating-point vector of [4 x float].
2651 static __inline__ __m128 __DEFAULT_FN_ATTRS
2652 _mm_movehl_ps(__m128 __a, __m128 __b)
2653 {
2654  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2655 }
2656 
2657 /// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
2658 /// 64 bits are set to the lower 64 bits of the first parameter. The upper
2659 /// 64 bits are set to the lower 64 bits of the second parameter.
2660 ///
2661 /// \headerfile <x86intrin.h>
2662 ///
2663 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2664 ///
2665 /// \param __a
2666 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2667 /// written to the lower 64 bits of the result.
2668 /// \param __b
2669 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2670 /// written to the upper 64 bits of the result.
2671 /// \returns A 128-bit floating-point vector of [4 x float].
2672 static __inline__ __m128 __DEFAULT_FN_ATTRS
2673 _mm_movelh_ps(__m128 __a, __m128 __b)
2674 {
2675  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2676 }
2677 
2678 /// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2679 /// float].
2680 ///
2681 /// \headerfile <x86intrin.h>
2682 ///
2683 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2684 ///
2685 /// \param __a
2686 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2687 /// from the corresponding elements in this operand.
2688 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2689 /// values from the operand.
2690 static __inline__ __m128 __DEFAULT_FN_ATTRS
2691 _mm_cvtpi16_ps(__m64 __a)
2692 {
2693  __m64 __b, __c;
2694  __m128 __r;
2695 
2696  __b = _mm_setzero_si64();
2697  __b = _mm_cmpgt_pi16(__b, __a);
2698  __c = _mm_unpackhi_pi16(__a, __b);
2699  __r = _mm_setzero_ps();
2700  __r = _mm_cvtpi32_ps(__r, __c);
2701  __r = _mm_movelh_ps(__r, __r);
2702  __c = _mm_unpacklo_pi16(__a, __b);
2703  __r = _mm_cvtpi32_ps(__r, __c);
2704 
2705  return __r;
2706 }
2707 
2708 /// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
2709 /// 128-bit vector of [4 x float].
2710 ///
2711 /// \headerfile <x86intrin.h>
2712 ///
2713 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2714 ///
2715 /// \param __a
2716 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2717 /// destination are copied from the corresponding elements in this operand.
2718 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2719 /// values from the operand.
2720 static __inline__ __m128 __DEFAULT_FN_ATTRS
2721 _mm_cvtpu16_ps(__m64 __a)
2722 {
2723  __m64 __b, __c;
2724  __m128 __r;
2725 
2726  __b = _mm_setzero_si64();
2727  __c = _mm_unpackhi_pi16(__a, __b);
2728  __r = _mm_setzero_ps();
2729  __r = _mm_cvtpi32_ps(__r, __c);
2730  __r = _mm_movelh_ps(__r, __r);
2731  __c = _mm_unpacklo_pi16(__a, __b);
2732  __r = _mm_cvtpi32_ps(__r, __c);
2733 
2734  return __r;
2735 }
2736 
2737 /// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2738 /// into a 128-bit vector of [4 x float].
2739 ///
2740 /// \headerfile <x86intrin.h>
2741 ///
2742 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2743 ///
2744 /// \param __a
2745 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2746 /// from the corresponding lower 4 elements in this operand.
2747 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2748 /// values from the operand.
2749 static __inline__ __m128 __DEFAULT_FN_ATTRS
2750 _mm_cvtpi8_ps(__m64 __a)
2751 {
2752  __m64 __b;
2753 
2754  __b = _mm_setzero_si64();
2755  __b = _mm_cmpgt_pi8(__b, __a);
2756  __b = _mm_unpacklo_pi8(__a, __b);
2757 
2758  return _mm_cvtpi16_ps(__b);
2759 }
2760 
2761 /// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
2762 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2763 ///
2764 /// \headerfile <x86intrin.h>
2765 ///
2766 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2767 ///
2768 /// \param __a
2769 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2770 /// destination are copied from the corresponding lower 4 elements in this
2771 /// operand.
2772 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2773 /// values from the source operand.
2774 static __inline__ __m128 __DEFAULT_FN_ATTRS
2775 _mm_cvtpu8_ps(__m64 __a)
2776 {
2777  __m64 __b;
2778 
2779  __b = _mm_setzero_si64();
2780  __b = _mm_unpacklo_pi8(__a, __b);
2781 
2782  return _mm_cvtpi16_ps(__b);
2783 }
2784 
2785 /// \brief Converts the two 32-bit signed integer values from each 64-bit vector
2786 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2787 ///
2788 /// \headerfile <x86intrin.h>
2789 ///
2790 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2791 ///
2792 /// \param __a
2793 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2794 /// copied from the elements in this operand.
2795 /// \param __b
2796 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2797 /// copied from the elements in this operand.
2798 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2799 /// copied and converted values from the first operand. The upper 64 bits
2800 /// contain the copied and converted values from the second operand.
2801 static __inline__ __m128 __DEFAULT_FN_ATTRS
2802 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2803 {
2804  __m128 __c;
2805 
2806  __c = _mm_setzero_ps();
2807  __c = _mm_cvtpi32_ps(__c, __b);
2808  __c = _mm_movelh_ps(__c, __c);
2809 
2810  return _mm_cvtpi32_ps(__c, __a);
2811 }
2812 
2813 /// \brief Converts each single-precision floating-point element of a 128-bit
2814 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2815 /// packs the results into a 64-bit integer vector of [4 x i16].
2816 ///
2817 /// If the floating-point element is NaN or infinity, or if the
2818 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2819 /// it is converted to 0x8000. Otherwise if the floating-point element is
2820 /// greater than 0x7FFF, it is converted to 0x7FFF.
2821 ///
2822 /// \headerfile <x86intrin.h>
2823 ///
2824 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2825 ///
2826 /// \param __a
2827 /// A 128-bit floating-point vector of [4 x float].
2828 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2829 /// values.
2830 static __inline__ __m64 __DEFAULT_FN_ATTRS
2831 _mm_cvtps_pi16(__m128 __a)
2832 {
2833  __m64 __b, __c;
2834 
2835  __b = _mm_cvtps_pi32(__a);
2836  __a = _mm_movehl_ps(__a, __a);
2837  __c = _mm_cvtps_pi32(__a);
2838 
2839  return _mm_packs_pi32(__b, __c);
2840 }
2841 
2842 /// \brief Converts each single-precision floating-point element of a 128-bit
2843 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
2844 /// packs the results into the lower 32 bits of a 64-bit integer vector of
2845 /// [8 x i8]. The upper 32 bits of the vector are set to 0.
2846 ///
2847 /// If the floating-point element is NaN or infinity, or if the
2848 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2849 /// is converted to 0x80. Otherwise if the floating-point element is greater
2850 /// than 0x7F, it is converted to 0x7F.
2851 ///
2852 /// \headerfile <x86intrin.h>
2853 ///
2854 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2855 ///
2856 /// \param __a
2857 /// 128-bit floating-point vector of [4 x float].
2858 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2859 /// converted values and the uppper 32 bits are set to zero.
2860 static __inline__ __m64 __DEFAULT_FN_ATTRS
2861 _mm_cvtps_pi8(__m128 __a)
2862 {
2863  __m64 __b, __c;
2864 
2865  __b = _mm_cvtps_pi16(__a);
2866  __c = _mm_setzero_si64();
2867 
2868  return _mm_packs_pi16(__b, __c);
2869 }
2870 
2871 /// \brief Extracts the sign bits from each single-precision floating-point
2872 /// element of a 128-bit floating-point vector of [4 x float] and returns the
2873 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2874 /// to zero.
2875 ///
2876 /// \headerfile <x86intrin.h>
2877 ///
2878 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2879 ///
2880 /// \param __a
2881 /// A 128-bit floating-point vector of [4 x float].
2882 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2883 /// single-precision floating-point element of the parameter. Bits [31:4] are
2884 /// set to zero.
2885 static __inline__ int __DEFAULT_FN_ATTRS
2886 _mm_movemask_ps(__m128 __a)
2887 {
2888  return __builtin_ia32_movmskps((__v4sf)__a);
2889 }
2890 
2891 
/* Attribute requesting 16-byte alignment for the declaration it is applied
   to. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit selectors into one 8-bit immediate: z occupies bits
   [7:6], y bits [5:4], x bits [3:2] and w bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) \
  (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2895 
/* MXCSR exception state flags; _MM_EXCEPT_MASK covers all six. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* MXCSR exception mask bits; _MM_MASK_MASK covers all six. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* MXCSR rounding-mode values; _MM_ROUND_MASK covers the whole field. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* MXCSR flush-to-zero mode values and the mask selecting that bit. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
2921 
/* Getters: read MXCSR with _mm_getcsr() and keep only the bits of one
   field. */
#define _MM_GET_EXCEPTION_MASK()  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE()   (_mm_getcsr() & _MM_ROUND_MASK)

/* Setters: read MXCSR, clear one field, OR in x and write the value back
   with _mm_setcsr(). */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2931 
/* Transposes, in place, the 4x4 matrix of floats whose rows are the four
 * __m128 lvalues row0, row1, row2 and row3.
 *
 * Each row argument is both read and assigned, and appears more than once in
 * the expansion, so it must be a modifiable lvalue without side effects.
 *
 * The temporaries use reserved, macro-specific names so that they cannot
 * shadow the caller's own variables (for example, rows named tmp0..tmp3).
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __mm_tr4_t3, __mm_tr4_t2, __mm_tr4_t1, __mm_tr4_t0; \
  __mm_tr4_t0 = _mm_unpacklo_ps((row0), (row1)); \
  __mm_tr4_t2 = _mm_unpacklo_ps((row2), (row3)); \
  __mm_tr4_t1 = _mm_unpackhi_ps((row0), (row1)); \
  __mm_tr4_t3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__mm_tr4_t0, __mm_tr4_t2); \
  (row1) = _mm_movehl_ps(__mm_tr4_t2, __mm_tr4_t0); \
  (row2) = _mm_movelh_ps(__mm_tr4_t1, __mm_tr4_t3); \
  (row3) = _mm_movehl_ps(__mm_tr4_t3, __mm_tr4_t1); \
} while (0)
2944 
/* Aliases for compatibility: each _m_* name expands to the corresponding
   _mm_* intrinsic. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Defined once; the second, identical definition was redundant. */
#define _m_ _mm_
2961 
2962 #undef __DEFAULT_FN_ATTRS
2963 
2964 /* Ugly hack for backwards-compatibility (compatible with gcc) */
2965 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
2966 #include <emmintrin.h>
2967 #endif
2968 
2969 #endif /* __XMMINTRIN_H */
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location. ...
Definition: xmmintrin.h:1932
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:986
static __inline unsigned char unsigned int unsigned int __y
Definition: adxintrin.h:36
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:322
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition: xmmintrin.h:339
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
Definition: xmmintrin.h:1602
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality and returns the ...
Definition: xmmintrin.h:499
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition: xmmintrin.h:2588
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1468
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1154
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition: xmmintrin.h:103
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:2861
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:757
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2236
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
Definition: xmmintrin.h:298
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values...
Definition: xmmintrin.h:358
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1811
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:1993
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
Definition: xmmintrin.h:1678
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition. ...
Definition: xmmintrin.h:81
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1296
int __v4si __attribute__((__vector_size__(16)))
Definition: xmmintrin.h:29
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:670
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(float *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location...
Definition: xmmintrin.h:2106
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2652
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts 16-bit signed integers from both 64-bit integer vector parameters of [4 x i16] into 8-bit si...
Definition: mmintrin.h:141
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:251
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location. ...
Definition: xmmintrin.h:1898
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:2013
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:602
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition: xmmintrin.h:2610
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
Definition: xmmintrin.h:1629
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:583
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
Definition: xmmintrin.h:124
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1070
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition: xmmintrin.h:1754
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
Definition: xmmintrin.h:381
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2198
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1261
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1390
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:540
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1336
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values...
Definition: xmmintrin.h:400
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1446
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1792
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2631
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:847
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
Definition: xmmintrin.h:2032
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:1953
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1429
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1541
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1656
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:961
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:146
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:243
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for ineq...
Definition: xmmintrin.h:733
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1239
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float]. ...
Definition: xmmintrin.h:1581
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:1006
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2255
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:38
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition: xmmintrin.h:2886
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1175
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(__m64 *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition: xmmintrin.h:2087
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:822
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:1881
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float]...
Definition: xmmintrin.h:279
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:869
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition: xmmintrin.h:1974
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition: xmmintrin.h:2292
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:777
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:941
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:477
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2673
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:916
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1197
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition: xmmintrin.h:2415
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
Definition: xmmintrin.h:2775
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1262
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:646
static __inline__ vector float vector float __b
Definition: altivec.h:534
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create a 16-bit ...
Definition: xmmintrin.h:2273
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition: xmmintrin.h:225
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
Definition: xmmintrin.h:316
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1774
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...
Definition: xmmintrin.h:188
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality and returns the ...
Definition: xmmintrin.h:1027
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition: xmmintrin.h:1740
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:625
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1491
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition: xmmintrin.h:1838
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition: xmmintrin.h:261
static __inline unsigned char unsigned int __x
Definition: adxintrin.h:36
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:299
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
Definition: xmmintrin.h:517
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float]...
Definition: xmmintrin.h:2721
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition: xmmintrin.h:2374
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:559
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1112
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1371
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition: xmmintrin.h:440
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition: xmmintrin.h:1866
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition: xmmintrin.h:2393
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition: xmmintrin.h:166
#define __DEFAULT_FN_ATTRS
Definition: xmmintrin.h:43
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:207
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1133
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:691
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1564
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:2831
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:458
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1298
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2691
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1049
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality and returns th...
Definition: xmmintrin.h:714
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:418
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:894
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2)
Converts 32-bit signed integers from both 64-bit integer vector parameters of [2 x i32] into 16-bit s...
Definition: mmintrin.h:171
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1219
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
Definition: xmmintrin.h:2750
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1718
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1241
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition: xmmintrin.h:1701
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition: xmmintrin.h:2355
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2217
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location. ...
Definition: xmmintrin.h:1915
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:802
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1091
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4199
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition: xmmintrin.h:2802
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:61
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1352
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1280