/* Source listing of xmmintrin.h, taken from the clang 18.0.0git documentation. */
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <mmintrin.h>
18
/* Element-typed 16-byte vectors used for casts inside this header:
 * four int lanes and four float lanes respectively. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* 16-byte vector of float, aligned to 16 bytes. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same size and element type as __m128, but with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
27
/* <mm_malloc.h> should only be included in a hosted environment as it depends
 * on a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif
33
/* Define the default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS:     always inlined, no debug info, target feature "sse",
 *                         minimum vector width 128.
 * __DEFAULT_FN_ATTRS_MMX: always inlined, no debug info, target features
 *                         "mmx,sse", minimum vector width 64.
 */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
37
38/// Adds the 32-bit float values in the low-order bits of the operands.
39///
40/// \headerfile <x86intrin.h>
41///
42/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
43///
44/// \param __a
45/// A 128-bit vector of [4 x float] containing one of the source operands.
46/// The lower 32 bits of this operand are used in the calculation.
47/// \param __b
48/// A 128-bit vector of [4 x float] containing one of the source operands.
49/// The lower 32 bits of this operand are used in the calculation.
50/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
51/// of the lower 32 bits of both operands. The upper 96 bits are copied from
52/// the upper 96 bits of the first source operand.
53static __inline__ __m128 __DEFAULT_FN_ATTRS
54_mm_add_ss(__m128 __a, __m128 __b)
55{
56 __a[0] += __b[0];
57 return __a;
58}
59
60/// Adds two 128-bit vectors of [4 x float], and returns the results of
61/// the addition.
62///
63/// \headerfile <x86intrin.h>
64///
65/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
66///
67/// \param __a
68/// A 128-bit vector of [4 x float] containing one of the source operands.
69/// \param __b
70/// A 128-bit vector of [4 x float] containing one of the source operands.
71/// \returns A 128-bit vector of [4 x float] containing the sums of both
72/// operands.
73static __inline__ __m128 __DEFAULT_FN_ATTRS
74_mm_add_ps(__m128 __a, __m128 __b)
75{
76 return (__m128)((__v4sf)__a + (__v4sf)__b);
77}
78
79/// Subtracts the 32-bit float value in the low-order bits of the second
80/// operand from the corresponding value in the first operand.
81///
82/// \headerfile <x86intrin.h>
83///
84/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
85///
86/// \param __a
87/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
88/// of this operand are used in the calculation.
89/// \param __b
90/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
91/// bits of this operand are used in the calculation.
92/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
93/// difference of the lower 32 bits of both operands. The upper 96 bits are
94/// copied from the upper 96 bits of the first source operand.
95static __inline__ __m128 __DEFAULT_FN_ATTRS
96_mm_sub_ss(__m128 __a, __m128 __b)
97{
98 __a[0] -= __b[0];
99 return __a;
100}
101
102/// Subtracts each of the values of the second operand from the first
103/// operand, both of which are 128-bit vectors of [4 x float] and returns
104/// the results of the subtraction.
105///
106/// \headerfile <x86intrin.h>
107///
108/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
109///
110/// \param __a
111/// A 128-bit vector of [4 x float] containing the minuend.
112/// \param __b
113/// A 128-bit vector of [4 x float] containing the subtrahend.
114/// \returns A 128-bit vector of [4 x float] containing the differences between
115/// both operands.
116static __inline__ __m128 __DEFAULT_FN_ATTRS
117_mm_sub_ps(__m128 __a, __m128 __b)
118{
119 return (__m128)((__v4sf)__a - (__v4sf)__b);
120}
121
122/// Multiplies two 32-bit float values in the low-order bits of the
123/// operands.
124///
125/// \headerfile <x86intrin.h>
126///
127/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
128///
129/// \param __a
130/// A 128-bit vector of [4 x float] containing one of the source operands.
131/// The lower 32 bits of this operand are used in the calculation.
132/// \param __b
133/// A 128-bit vector of [4 x float] containing one of the source operands.
134/// The lower 32 bits of this operand are used in the calculation.
135/// \returns A 128-bit vector of [4 x float] containing the product of the lower
136/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
137/// bits of the first source operand.
138static __inline__ __m128 __DEFAULT_FN_ATTRS
139_mm_mul_ss(__m128 __a, __m128 __b)
140{
141 __a[0] *= __b[0];
142 return __a;
143}
144
145/// Multiplies two 128-bit vectors of [4 x float] and returns the
146/// results of the multiplication.
147///
148/// \headerfile <x86intrin.h>
149///
150/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
151///
152/// \param __a
153/// A 128-bit vector of [4 x float] containing one of the source operands.
154/// \param __b
155/// A 128-bit vector of [4 x float] containing one of the source operands.
156/// \returns A 128-bit vector of [4 x float] containing the products of both
157/// operands.
158static __inline__ __m128 __DEFAULT_FN_ATTRS
159_mm_mul_ps(__m128 __a, __m128 __b)
160{
161 return (__m128)((__v4sf)__a * (__v4sf)__b);
162}
163
164/// Divides the value in the low-order 32 bits of the first operand by
165/// the corresponding value in the second operand.
166///
167/// \headerfile <x86intrin.h>
168///
169/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
170///
171/// \param __a
172/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
173/// bits of this operand are used in the calculation.
174/// \param __b
175/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
176/// of this operand are used in the calculation.
177/// \returns A 128-bit vector of [4 x float] containing the quotients of the
178/// lower 32 bits of both operands. The upper 96 bits are copied from the
179/// upper 96 bits of the first source operand.
180static __inline__ __m128 __DEFAULT_FN_ATTRS
181_mm_div_ss(__m128 __a, __m128 __b)
182{
183 __a[0] /= __b[0];
184 return __a;
185}
186
187/// Divides two 128-bit vectors of [4 x float].
188///
189/// \headerfile <x86intrin.h>
190///
191/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
192///
193/// \param __a
194/// A 128-bit vector of [4 x float] containing the dividend.
195/// \param __b
196/// A 128-bit vector of [4 x float] containing the divisor.
197/// \returns A 128-bit vector of [4 x float] containing the quotients of both
198/// operands.
199static __inline__ __m128 __DEFAULT_FN_ATTRS
200_mm_div_ps(__m128 __a, __m128 __b)
201{
202 return (__m128)((__v4sf)__a / (__v4sf)__b);
203}
204
205/// Calculates the square root of the value stored in the low-order bits
206/// of a 128-bit vector of [4 x float].
207///
208/// \headerfile <x86intrin.h>
209///
210/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
211///
212/// \param __a
213/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
214/// used in the calculation.
215/// \returns A 128-bit vector of [4 x float] containing the square root of the
216/// value in the low-order bits of the operand.
217static __inline__ __m128 __DEFAULT_FN_ATTRS
219{
220 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
221}
222
223/// Calculates the square roots of the values stored in a 128-bit vector
224/// of [4 x float].
225///
226/// \headerfile <x86intrin.h>
227///
228/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
229///
230/// \param __a
231/// A 128-bit vector of [4 x float].
232/// \returns A 128-bit vector of [4 x float] containing the square roots of the
233/// values in the operand.
234static __inline__ __m128 __DEFAULT_FN_ATTRS
236{
237 return __builtin_ia32_sqrtps((__v4sf)__a);
238}
239
240/// Calculates the approximate reciprocal of the value stored in the
241/// low-order bits of a 128-bit vector of [4 x float].
242///
243/// \headerfile <x86intrin.h>
244///
245/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
246///
247/// \param __a
248/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
249/// used in the calculation.
250/// \returns A 128-bit vector of [4 x float] containing the approximate
251/// reciprocal of the value in the low-order bits of the operand.
252static __inline__ __m128 __DEFAULT_FN_ATTRS
254{
255 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
256}
257
258/// Calculates the approximate reciprocals of the values stored in a
259/// 128-bit vector of [4 x float].
260///
261/// \headerfile <x86intrin.h>
262///
263/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
264///
265/// \param __a
266/// A 128-bit vector of [4 x float].
267/// \returns A 128-bit vector of [4 x float] containing the approximate
268/// reciprocals of the values in the operand.
269static __inline__ __m128 __DEFAULT_FN_ATTRS
271{
272 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
273}
274
275/// Calculates the approximate reciprocal of the square root of the value
276/// stored in the low-order bits of a 128-bit vector of [4 x float].
277///
278/// \headerfile <x86intrin.h>
279///
280/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
281///
282/// \param __a
283/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
284/// used in the calculation.
285/// \returns A 128-bit vector of [4 x float] containing the approximate
286/// reciprocal of the square root of the value in the low-order bits of the
287/// operand.
288static __inline__ __m128 __DEFAULT_FN_ATTRS
290{
291 return __builtin_ia32_rsqrtss((__v4sf)__a);
292}
293
294/// Calculates the approximate reciprocals of the square roots of the
295/// values stored in a 128-bit vector of [4 x float].
296///
297/// \headerfile <x86intrin.h>
298///
299/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
300///
301/// \param __a
302/// A 128-bit vector of [4 x float].
303/// \returns A 128-bit vector of [4 x float] containing the approximate
304/// reciprocals of the square roots of the values in the operand.
305static __inline__ __m128 __DEFAULT_FN_ATTRS
307{
308 return __builtin_ia32_rsqrtps((__v4sf)__a);
309}
310
311/// Compares two 32-bit float values in the low-order bits of both
312/// operands and returns the lesser value in the low-order bits of the
313/// vector of [4 x float].
314///
315/// \headerfile <x86intrin.h>
316///
317/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
318///
319/// \param __a
320/// A 128-bit vector of [4 x float] containing one of the operands. The lower
321/// 32 bits of this operand are used in the comparison.
322/// \param __b
323/// A 128-bit vector of [4 x float] containing one of the operands. The lower
324/// 32 bits of this operand are used in the comparison.
325/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
326/// minimum value between both operands. The upper 96 bits are copied from
327/// the upper 96 bits of the first source operand.
328static __inline__ __m128 __DEFAULT_FN_ATTRS
329_mm_min_ss(__m128 __a, __m128 __b)
330{
331 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
332}
333
334/// Compares two 128-bit vectors of [4 x float] and returns the lesser
335/// of each pair of values.
336///
337/// \headerfile <x86intrin.h>
338///
339/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
340///
341/// \param __a
342/// A 128-bit vector of [4 x float] containing one of the operands.
343/// \param __b
344/// A 128-bit vector of [4 x float] containing one of the operands.
345/// \returns A 128-bit vector of [4 x float] containing the minimum values
346/// between both operands.
347static __inline__ __m128 __DEFAULT_FN_ATTRS
348_mm_min_ps(__m128 __a, __m128 __b)
349{
350 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
351}
352
353/// Compares two 32-bit float values in the low-order bits of both
354/// operands and returns the greater value in the low-order bits of a 128-bit
355/// vector of [4 x float].
356///
357/// \headerfile <x86intrin.h>
358///
359/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
360///
361/// \param __a
362/// A 128-bit vector of [4 x float] containing one of the operands. The lower
363/// 32 bits of this operand are used in the comparison.
364/// \param __b
365/// A 128-bit vector of [4 x float] containing one of the operands. The lower
366/// 32 bits of this operand are used in the comparison.
367/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
368/// maximum value between both operands. The upper 96 bits are copied from
369/// the upper 96 bits of the first source operand.
370static __inline__ __m128 __DEFAULT_FN_ATTRS
371_mm_max_ss(__m128 __a, __m128 __b)
372{
373 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
374}
375
376/// Compares two 128-bit vectors of [4 x float] and returns the greater
377/// of each pair of values.
378///
379/// \headerfile <x86intrin.h>
380///
381/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
382///
383/// \param __a
384/// A 128-bit vector of [4 x float] containing one of the operands.
385/// \param __b
386/// A 128-bit vector of [4 x float] containing one of the operands.
387/// \returns A 128-bit vector of [4 x float] containing the maximum values
388/// between both operands.
389static __inline__ __m128 __DEFAULT_FN_ATTRS
390_mm_max_ps(__m128 __a, __m128 __b)
391{
392 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
393}
394
395/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
396///
397/// \headerfile <x86intrin.h>
398///
399/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
400///
401/// \param __a
402/// A 128-bit vector containing one of the source operands.
403/// \param __b
404/// A 128-bit vector containing one of the source operands.
405/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
406/// values between both operands.
407static __inline__ __m128 __DEFAULT_FN_ATTRS
408_mm_and_ps(__m128 __a, __m128 __b)
409{
410 return (__m128)((__v4su)__a & (__v4su)__b);
411}
412
413/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
414/// the one's complement of the values contained in the first source
415/// operand.
416///
417/// \headerfile <x86intrin.h>
418///
419/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
420///
421/// \param __a
422/// A 128-bit vector of [4 x float] containing the first source operand. The
423/// one's complement of this value is used in the bitwise AND.
424/// \param __b
425/// A 128-bit vector of [4 x float] containing the second source operand.
426/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
427/// one's complement of the first operand and the values in the second
428/// operand.
429static __inline__ __m128 __DEFAULT_FN_ATTRS
430_mm_andnot_ps(__m128 __a, __m128 __b)
431{
432 return (__m128)(~(__v4su)__a & (__v4su)__b);
433}
434
435/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
436///
437/// \headerfile <x86intrin.h>
438///
439/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
440///
441/// \param __a
442/// A 128-bit vector of [4 x float] containing one of the source operands.
443/// \param __b
444/// A 128-bit vector of [4 x float] containing one of the source operands.
445/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
446/// values between both operands.
447static __inline__ __m128 __DEFAULT_FN_ATTRS
448_mm_or_ps(__m128 __a, __m128 __b)
449{
450 return (__m128)((__v4su)__a | (__v4su)__b);
451}
452
453/// Performs a bitwise exclusive OR of two 128-bit vectors of
454/// [4 x float].
455///
456/// \headerfile <x86intrin.h>
457///
458/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
459///
460/// \param __a
461/// A 128-bit vector of [4 x float] containing one of the source operands.
462/// \param __b
463/// A 128-bit vector of [4 x float] containing one of the source operands.
464/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
465/// of the values between both operands.
466static __inline__ __m128 __DEFAULT_FN_ATTRS
467_mm_xor_ps(__m128 __a, __m128 __b)
468{
469 return (__m128)((__v4su)__a ^ (__v4su)__b);
470}
471
472/// Compares two 32-bit float values in the low-order bits of both
473/// operands for equality and returns the result of the comparison in the
474/// low-order bits of a vector [4 x float].
475///
476/// \headerfile <x86intrin.h>
477///
478/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
479///
480/// \param __a
481/// A 128-bit vector of [4 x float] containing one of the operands. The lower
482/// 32 bits of this operand are used in the comparison.
483/// \param __b
484/// A 128-bit vector of [4 x float] containing one of the operands. The lower
485/// 32 bits of this operand are used in the comparison.
486/// \returns A 128-bit vector of [4 x float] containing the comparison results
487/// in the low-order bits.
488static __inline__ __m128 __DEFAULT_FN_ATTRS
489_mm_cmpeq_ss(__m128 __a, __m128 __b)
490{
491 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
492}
493
494/// Compares each of the corresponding 32-bit float values of the
495/// 128-bit vectors of [4 x float] for equality.
496///
497/// \headerfile <x86intrin.h>
498///
499/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
500///
501/// \param __a
502/// A 128-bit vector of [4 x float].
503/// \param __b
504/// A 128-bit vector of [4 x float].
505/// \returns A 128-bit vector of [4 x float] containing the comparison results.
506static __inline__ __m128 __DEFAULT_FN_ATTRS
507_mm_cmpeq_ps(__m128 __a, __m128 __b)
508{
509 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
510}
511
512/// Compares two 32-bit float values in the low-order bits of both
513/// operands to determine if the value in the first operand is less than the
514/// corresponding value in the second operand and returns the result of the
515/// comparison in the low-order bits of a vector of [4 x float].
516///
517/// \headerfile <x86intrin.h>
518///
519/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
520///
521/// \param __a
522/// A 128-bit vector of [4 x float] containing one of the operands. The lower
523/// 32 bits of this operand are used in the comparison.
524/// \param __b
525/// A 128-bit vector of [4 x float] containing one of the operands. The lower
526/// 32 bits of this operand are used in the comparison.
527/// \returns A 128-bit vector of [4 x float] containing the comparison results
528/// in the low-order bits.
529static __inline__ __m128 __DEFAULT_FN_ATTRS
530_mm_cmplt_ss(__m128 __a, __m128 __b)
531{
532 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
533}
534
535/// Compares each of the corresponding 32-bit float values of the
536/// 128-bit vectors of [4 x float] to determine if the values in the first
537/// operand are less than those in the second operand.
538///
539/// \headerfile <x86intrin.h>
540///
541/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
542///
543/// \param __a
544/// A 128-bit vector of [4 x float].
545/// \param __b
546/// A 128-bit vector of [4 x float].
547/// \returns A 128-bit vector of [4 x float] containing the comparison results.
548static __inline__ __m128 __DEFAULT_FN_ATTRS
549_mm_cmplt_ps(__m128 __a, __m128 __b)
550{
551 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
552}
553
554/// Compares two 32-bit float values in the low-order bits of both
555/// operands to determine if the value in the first operand is less than or
556/// equal to the corresponding value in the second operand and returns the
557/// result of the comparison in the low-order bits of a vector of
558/// [4 x float].
559///
560/// \headerfile <x86intrin.h>
561///
562/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
563///
564/// \param __a
565/// A 128-bit vector of [4 x float] containing one of the operands. The lower
566/// 32 bits of this operand are used in the comparison.
567/// \param __b
568/// A 128-bit vector of [4 x float] containing one of the operands. The lower
569/// 32 bits of this operand are used in the comparison.
570/// \returns A 128-bit vector of [4 x float] containing the comparison results
571/// in the low-order bits.
572static __inline__ __m128 __DEFAULT_FN_ATTRS
573_mm_cmple_ss(__m128 __a, __m128 __b)
574{
575 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
576}
577
578/// Compares each of the corresponding 32-bit float values of the
579/// 128-bit vectors of [4 x float] to determine if the values in the first
580/// operand are less than or equal to those in the second operand.
581///
582/// \headerfile <x86intrin.h>
583///
584/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
585///
586/// \param __a
587/// A 128-bit vector of [4 x float].
588/// \param __b
589/// A 128-bit vector of [4 x float].
590/// \returns A 128-bit vector of [4 x float] containing the comparison results.
591static __inline__ __m128 __DEFAULT_FN_ATTRS
592_mm_cmple_ps(__m128 __a, __m128 __b)
593{
594 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
595}
596
597/// Compares two 32-bit float values in the low-order bits of both
598/// operands to determine if the value in the first operand is greater than
599/// the corresponding value in the second operand and returns the result of
600/// the comparison in the low-order bits of a vector of [4 x float].
601///
602/// \headerfile <x86intrin.h>
603///
604/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
605///
606/// \param __a
607/// A 128-bit vector of [4 x float] containing one of the operands. The lower
608/// 32 bits of this operand are used in the comparison.
609/// \param __b
610/// A 128-bit vector of [4 x float] containing one of the operands. The lower
611/// 32 bits of this operand are used in the comparison.
612/// \returns A 128-bit vector of [4 x float] containing the comparison results
613/// in the low-order bits.
614static __inline__ __m128 __DEFAULT_FN_ATTRS
615_mm_cmpgt_ss(__m128 __a, __m128 __b)
616{
617 return (__m128)__builtin_shufflevector((__v4sf)__a,
618 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
619 4, 1, 2, 3);
620}
621
622/// Compares each of the corresponding 32-bit float values of the
623/// 128-bit vectors of [4 x float] to determine if the values in the first
624/// operand are greater than those in the second operand.
625///
626/// \headerfile <x86intrin.h>
627///
628/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
629///
630/// \param __a
631/// A 128-bit vector of [4 x float].
632/// \param __b
633/// A 128-bit vector of [4 x float].
634/// \returns A 128-bit vector of [4 x float] containing the comparison results.
635static __inline__ __m128 __DEFAULT_FN_ATTRS
636_mm_cmpgt_ps(__m128 __a, __m128 __b)
637{
638 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
639}
640
641/// Compares two 32-bit float values in the low-order bits of both
642/// operands to determine if the value in the first operand is greater than
643/// or equal to the corresponding value in the second operand and returns
644/// the result of the comparison in the low-order bits of a vector of
645/// [4 x float].
646///
647/// \headerfile <x86intrin.h>
648///
649/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
650///
651/// \param __a
652/// A 128-bit vector of [4 x float] containing one of the operands. The lower
653/// 32 bits of this operand are used in the comparison.
654/// \param __b
655/// A 128-bit vector of [4 x float] containing one of the operands. The lower
656/// 32 bits of this operand are used in the comparison.
657/// \returns A 128-bit vector of [4 x float] containing the comparison results
658/// in the low-order bits.
659static __inline__ __m128 __DEFAULT_FN_ATTRS
660_mm_cmpge_ss(__m128 __a, __m128 __b)
661{
662 return (__m128)__builtin_shufflevector((__v4sf)__a,
663 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
664 4, 1, 2, 3);
665}
666
667/// Compares each of the corresponding 32-bit float values of the
668/// 128-bit vectors of [4 x float] to determine if the values in the first
669/// operand are greater than or equal to those in the second operand.
670///
671/// \headerfile <x86intrin.h>
672///
673/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
674///
675/// \param __a
676/// A 128-bit vector of [4 x float].
677/// \param __b
678/// A 128-bit vector of [4 x float].
679/// \returns A 128-bit vector of [4 x float] containing the comparison results.
680static __inline__ __m128 __DEFAULT_FN_ATTRS
681_mm_cmpge_ps(__m128 __a, __m128 __b)
682{
683 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
684}
685
686/// Compares two 32-bit float values in the low-order bits of both
687/// operands for inequality and returns the result of the comparison in the
688/// low-order bits of a vector of [4 x float].
689///
690/// \headerfile <x86intrin.h>
691///
692/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
693/// instructions.
694///
695/// \param __a
696/// A 128-bit vector of [4 x float] containing one of the operands. The lower
697/// 32 bits of this operand are used in the comparison.
698/// \param __b
699/// A 128-bit vector of [4 x float] containing one of the operands. The lower
700/// 32 bits of this operand are used in the comparison.
701/// \returns A 128-bit vector of [4 x float] containing the comparison results
702/// in the low-order bits.
703static __inline__ __m128 __DEFAULT_FN_ATTRS
704_mm_cmpneq_ss(__m128 __a, __m128 __b)
705{
706 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
707}
708
709/// Compares each of the corresponding 32-bit float values of the
710/// 128-bit vectors of [4 x float] for inequality.
711///
712/// \headerfile <x86intrin.h>
713///
714/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
715/// instructions.
716///
717/// \param __a
718/// A 128-bit vector of [4 x float].
719/// \param __b
720/// A 128-bit vector of [4 x float].
721/// \returns A 128-bit vector of [4 x float] containing the comparison results.
722static __inline__ __m128 __DEFAULT_FN_ATTRS
723_mm_cmpneq_ps(__m128 __a, __m128 __b)
724{
725 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
726}
727
728/// Compares two 32-bit float values in the low-order bits of both
729/// operands to determine if the value in the first operand is not less than
730/// the corresponding value in the second operand and returns the result of
731/// the comparison in the low-order bits of a vector of [4 x float].
732///
733/// \headerfile <x86intrin.h>
734///
735/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
736/// instructions.
737///
738/// \param __a
739/// A 128-bit vector of [4 x float] containing one of the operands. The lower
740/// 32 bits of this operand are used in the comparison.
741/// \param __b
742/// A 128-bit vector of [4 x float] containing one of the operands. The lower
743/// 32 bits of this operand are used in the comparison.
744/// \returns A 128-bit vector of [4 x float] containing the comparison results
745/// in the low-order bits.
746static __inline__ __m128 __DEFAULT_FN_ATTRS
747_mm_cmpnlt_ss(__m128 __a, __m128 __b)
748{
749 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
750}
751
752/// Compares each of the corresponding 32-bit float values of the
753/// 128-bit vectors of [4 x float] to determine if the values in the first
754/// operand are not less than those in the second operand.
755///
756/// \headerfile <x86intrin.h>
757///
758/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
759/// instructions.
760///
761/// \param __a
762/// A 128-bit vector of [4 x float].
763/// \param __b
764/// A 128-bit vector of [4 x float].
765/// \returns A 128-bit vector of [4 x float] containing the comparison results.
766static __inline__ __m128 __DEFAULT_FN_ATTRS
767_mm_cmpnlt_ps(__m128 __a, __m128 __b)
768{
769 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
770}
771
772/// Compares two 32-bit float values in the low-order bits of both
773/// operands to determine if the value in the first operand is not less than
774/// or equal to the corresponding value in the second operand and returns
775/// the result of the comparison in the low-order bits of a vector of
776/// [4 x float].
777///
778/// \headerfile <x86intrin.h>
779///
780/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
781/// instructions.
782///
783/// \param __a
784/// A 128-bit vector of [4 x float] containing one of the operands. The lower
785/// 32 bits of this operand are used in the comparison.
786/// \param __b
787/// A 128-bit vector of [4 x float] containing one of the operands. The lower
788/// 32 bits of this operand are used in the comparison.
789/// \returns A 128-bit vector of [4 x float] containing the comparison results
790/// in the low-order bits.
791static __inline__ __m128 __DEFAULT_FN_ATTRS
792_mm_cmpnle_ss(__m128 __a, __m128 __b)
793{
794 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
795}
796
797/// Compares each of the corresponding 32-bit float values of the
798/// 128-bit vectors of [4 x float] to determine if the values in the first
799/// operand are not less than or equal to those in the second operand.
800///
801/// \headerfile <x86intrin.h>
802///
803/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
804/// instructions.
805///
806/// \param __a
807/// A 128-bit vector of [4 x float].
808/// \param __b
809/// A 128-bit vector of [4 x float].
810/// \returns A 128-bit vector of [4 x float] containing the comparison results.
811static __inline__ __m128 __DEFAULT_FN_ATTRS
812_mm_cmpnle_ps(__m128 __a, __m128 __b)
813{
814 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
815}
816
817/// Compares two 32-bit float values in the low-order bits of both
818/// operands to determine if the value in the first operand is not greater
819/// than the corresponding value in the second operand and returns the
820/// result of the comparison in the low-order bits of a vector of
821/// [4 x float].
822///
823/// \headerfile <x86intrin.h>
824///
825/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
826/// instructions.
827///
828/// \param __a
829/// A 128-bit vector of [4 x float] containing one of the operands. The lower
830/// 32 bits of this operand are used in the comparison.
831/// \param __b
832/// A 128-bit vector of [4 x float] containing one of the operands. The lower
833/// 32 bits of this operand are used in the comparison.
834/// \returns A 128-bit vector of [4 x float] containing the comparison results
835/// in the low-order bits.
836static __inline__ __m128 __DEFAULT_FN_ATTRS
837_mm_cmpngt_ss(__m128 __a, __m128 __b)
838{
839 return (__m128)__builtin_shufflevector((__v4sf)__a,
840 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
841 4, 1, 2, 3);
842}
843
844/// Compares each of the corresponding 32-bit float values of the
845/// 128-bit vectors of [4 x float] to determine if the values in the first
846/// operand are not greater than those in the second operand.
847///
848/// \headerfile <x86intrin.h>
849///
850/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
851/// instructions.
852///
853/// \param __a
854/// A 128-bit vector of [4 x float].
855/// \param __b
856/// A 128-bit vector of [4 x float].
857/// \returns A 128-bit vector of [4 x float] containing the comparison results.
858static __inline__ __m128 __DEFAULT_FN_ATTRS
859_mm_cmpngt_ps(__m128 __a, __m128 __b)
860{
861 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
862}
863
864/// Compares two 32-bit float values in the low-order bits of both
865/// operands to determine if the value in the first operand is not greater
866/// than or equal to the corresponding value in the second operand and
867/// returns the result of the comparison in the low-order bits of a vector
868/// of [4 x float].
869///
870/// \headerfile <x86intrin.h>
871///
872/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
873/// instructions.
874///
875/// \param __a
876/// A 128-bit vector of [4 x float] containing one of the operands. The lower
877/// 32 bits of this operand are used in the comparison.
878/// \param __b
879/// A 128-bit vector of [4 x float] containing one of the operands. The lower
880/// 32 bits of this operand are used in the comparison.
881/// \returns A 128-bit vector of [4 x float] containing the comparison results
882/// in the low-order bits.
883static __inline__ __m128 __DEFAULT_FN_ATTRS
884_mm_cmpnge_ss(__m128 __a, __m128 __b)
885{
886 return (__m128)__builtin_shufflevector((__v4sf)__a,
887 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
888 4, 1, 2, 3);
889}
890
891/// Compares each of the corresponding 32-bit float values of the
892/// 128-bit vectors of [4 x float] to determine if the values in the first
893/// operand are not greater than or equal to those in the second operand.
894///
895/// \headerfile <x86intrin.h>
896///
897/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
898/// instructions.
899///
900/// \param __a
901/// A 128-bit vector of [4 x float].
902/// \param __b
903/// A 128-bit vector of [4 x float].
904/// \returns A 128-bit vector of [4 x float] containing the comparison results.
905static __inline__ __m128 __DEFAULT_FN_ATTRS
906_mm_cmpnge_ps(__m128 __a, __m128 __b)
907{
908 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
909}
910
911/// Compares two 32-bit float values in the low-order bits of both
912/// operands to determine if the value in the first operand is ordered with
913/// respect to the corresponding value in the second operand and returns the
914/// result of the comparison in the low-order bits of a vector of
915/// [4 x float].
916///
917/// \headerfile <x86intrin.h>
918///
919/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
920/// instructions.
921///
922/// \param __a
923/// A 128-bit vector of [4 x float] containing one of the operands. The lower
924/// 32 bits of this operand are used in the comparison.
925/// \param __b
926/// A 128-bit vector of [4 x float] containing one of the operands. The lower
927/// 32 bits of this operand are used in the comparison.
928/// \returns A 128-bit vector of [4 x float] containing the comparison results
929/// in the low-order bits.
930static __inline__ __m128 __DEFAULT_FN_ATTRS
931_mm_cmpord_ss(__m128 __a, __m128 __b)
932{
933 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
934}
935
936/// Compares each of the corresponding 32-bit float values of the
937/// 128-bit vectors of [4 x float] to determine if the values in the first
938/// operand are ordered with respect to those in the second operand.
939///
940/// \headerfile <x86intrin.h>
941///
942/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
943/// instructions.
944///
945/// \param __a
946/// A 128-bit vector of [4 x float].
947/// \param __b
948/// A 128-bit vector of [4 x float].
949/// \returns A 128-bit vector of [4 x float] containing the comparison results.
950static __inline__ __m128 __DEFAULT_FN_ATTRS
951_mm_cmpord_ps(__m128 __a, __m128 __b)
952{
953 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
954}
955
956/// Compares two 32-bit float values in the low-order bits of both
957/// operands to determine if the value in the first operand is unordered
958/// with respect to the corresponding value in the second operand and
959/// returns the result of the comparison in the low-order bits of a vector
960/// of [4 x float].
961///
962/// \headerfile <x86intrin.h>
963///
964/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
965/// instructions.
966///
967/// \param __a
968/// A 128-bit vector of [4 x float] containing one of the operands. The lower
969/// 32 bits of this operand are used in the comparison.
970/// \param __b
971/// A 128-bit vector of [4 x float] containing one of the operands. The lower
972/// 32 bits of this operand are used in the comparison.
973/// \returns A 128-bit vector of [4 x float] containing the comparison results
974/// in the low-order bits.
975static __inline__ __m128 __DEFAULT_FN_ATTRS
976_mm_cmpunord_ss(__m128 __a, __m128 __b)
977{
978 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
979}
980
981/// Compares each of the corresponding 32-bit float values of the
982/// 128-bit vectors of [4 x float] to determine if the values in the first
983/// operand are unordered with respect to those in the second operand.
984///
985/// \headerfile <x86intrin.h>
986///
987/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
988/// instructions.
989///
990/// \param __a
991/// A 128-bit vector of [4 x float].
992/// \param __b
993/// A 128-bit vector of [4 x float].
994/// \returns A 128-bit vector of [4 x float] containing the comparison results.
995static __inline__ __m128 __DEFAULT_FN_ATTRS
996_mm_cmpunord_ps(__m128 __a, __m128 __b)
997{
998 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
999}
1000
1001/// Compares two 32-bit float values in the low-order bits of both
1002/// operands for equality and returns the result of the comparison.
1003///
1004/// If either of the two lower 32-bit values is NaN, 0 is returned.
1005///
1006/// \headerfile <x86intrin.h>
1007///
1008/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1009/// instructions.
1010///
1011/// \param __a
1012/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1013/// used in the comparison.
1014/// \param __b
1015/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1016/// used in the comparison.
1017/// \returns An integer containing the comparison results. If either of the
1018/// two lower 32-bit values is NaN, 0 is returned.
1019static __inline__ int __DEFAULT_FN_ATTRS
1020_mm_comieq_ss(__m128 __a, __m128 __b)
1021{
1022 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1023}
1024
1025/// Compares two 32-bit float values in the low-order bits of both
1026/// operands to determine if the first operand is less than the second
1027/// operand and returns the result of the comparison.
1028///
1029/// If either of the two lower 32-bit values is NaN, 0 is returned.
1030///
1031/// \headerfile <x86intrin.h>
1032///
1033/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1034/// instructions.
1035///
1036/// \param __a
1037/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1038/// used in the comparison.
1039/// \param __b
1040/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1041/// used in the comparison.
1042/// \returns An integer containing the comparison results. If either of the two
1043/// lower 32-bit values is NaN, 0 is returned.
1044static __inline__ int __DEFAULT_FN_ATTRS
1045_mm_comilt_ss(__m128 __a, __m128 __b)
1046{
1047 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1048}
1049
1050/// Compares two 32-bit float values in the low-order bits of both
1051/// operands to determine if the first operand is less than or equal to the
1052/// second operand and returns the result of the comparison.
1053///
1054/// If either of the two lower 32-bit values is NaN, 0 is returned.
1055///
1056/// \headerfile <x86intrin.h>
1057///
1058/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1059///
1060/// \param __a
1061/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1062/// used in the comparison.
1063/// \param __b
1064/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1065/// used in the comparison.
1066/// \returns An integer containing the comparison results. If either of the two
1067/// lower 32-bit values is NaN, 0 is returned.
1068static __inline__ int __DEFAULT_FN_ATTRS
1069_mm_comile_ss(__m128 __a, __m128 __b)
1070{
1071 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1072}
1073
1074/// Compares two 32-bit float values in the low-order bits of both
1075/// operands to determine if the first operand is greater than the second
1076/// operand and returns the result of the comparison.
1077///
1078/// If either of the two lower 32-bit values is NaN, 0 is returned.
1079///
1080/// \headerfile <x86intrin.h>
1081///
1082/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1083///
1084/// \param __a
1085/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1086/// used in the comparison.
1087/// \param __b
1088/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1089/// used in the comparison.
1090/// \returns An integer containing the comparison results. If either of the
1091/// two lower 32-bit values is NaN, 0 is returned.
1092static __inline__ int __DEFAULT_FN_ATTRS
1093_mm_comigt_ss(__m128 __a, __m128 __b)
1094{
1095 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1096}
1097
1098/// Compares two 32-bit float values in the low-order bits of both
1099/// operands to determine if the first operand is greater than or equal to
1100/// the second operand and returns the result of the comparison.
1101///
1102/// If either of the two lower 32-bit values is NaN, 0 is returned.
1103///
1104/// \headerfile <x86intrin.h>
1105///
1106/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1107///
1108/// \param __a
1109/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1110/// used in the comparison.
1111/// \param __b
1112/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1113/// used in the comparison.
1114/// \returns An integer containing the comparison results. If either of the two
1115/// lower 32-bit values is NaN, 0 is returned.
1116static __inline__ int __DEFAULT_FN_ATTRS
1117_mm_comige_ss(__m128 __a, __m128 __b)
1118{
1119 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1120}
1121
1122/// Compares two 32-bit float values in the low-order bits of both
1123/// operands to determine if the first operand is not equal to the second
1124/// operand and returns the result of the comparison.
1125///
1126/// If either of the two lower 32-bit values is NaN, 1 is returned.
1127///
1128/// \headerfile <x86intrin.h>
1129///
1130/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1131///
1132/// \param __a
1133/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1134/// used in the comparison.
1135/// \param __b
1136/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1137/// used in the comparison.
1138/// \returns An integer containing the comparison results. If either of the
1139/// two lower 32-bit values is NaN, 1 is returned.
1140static __inline__ int __DEFAULT_FN_ATTRS
1141_mm_comineq_ss(__m128 __a, __m128 __b)
1142{
1143 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1144}
1145
1146/// Performs an unordered comparison of two 32-bit float values using
1147/// the low-order bits of both operands to determine equality and returns
1148/// the result of the comparison.
1149///
1150/// If either of the two lower 32-bit values is NaN, 0 is returned.
1151///
1152/// \headerfile <x86intrin.h>
1153///
1154/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1155///
1156/// \param __a
1157/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1158/// used in the comparison.
1159/// \param __b
1160/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1161/// used in the comparison.
1162/// \returns An integer containing the comparison results. If either of the two
1163/// lower 32-bit values is NaN, 0 is returned.
1164static __inline__ int __DEFAULT_FN_ATTRS
1165_mm_ucomieq_ss(__m128 __a, __m128 __b)
1166{
1167 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1168}
1169
1170/// Performs an unordered comparison of two 32-bit float values using
1171/// the low-order bits of both operands to determine if the first operand is
1172/// less than the second operand and returns the result of the comparison.
1173///
1174/// If either of the two lower 32-bit values is NaN, 0 is returned.
1175///
1176/// \headerfile <x86intrin.h>
1177///
1178/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1179///
1180/// \param __a
1181/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1182/// used in the comparison.
1183/// \param __b
1184/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1185/// used in the comparison.
1186/// \returns An integer containing the comparison results. If either of the two
1187/// lower 32-bit values is NaN, 0 is returned.
1188static __inline__ int __DEFAULT_FN_ATTRS
1189_mm_ucomilt_ss(__m128 __a, __m128 __b)
1190{
1191 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1192}
1193
1194/// Performs an unordered comparison of two 32-bit float values using
1195/// the low-order bits of both operands to determine if the first operand is
1196/// less than or equal to the second operand and returns the result of the
1197/// comparison.
1198///
1199/// If either of the two lower 32-bit values is NaN, 0 is returned.
1200///
1201/// \headerfile <x86intrin.h>
1202///
1203/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1204///
1205/// \param __a
1206/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1207/// used in the comparison.
1208/// \param __b
1209/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1210/// used in the comparison.
1211/// \returns An integer containing the comparison results. If either of the two
1212/// lower 32-bit values is NaN, 0 is returned.
1213static __inline__ int __DEFAULT_FN_ATTRS
1214_mm_ucomile_ss(__m128 __a, __m128 __b)
1215{
1216 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1217}
1218
1219/// Performs an unordered comparison of two 32-bit float values using
1220/// the low-order bits of both operands to determine if the first operand is
1221/// greater than the second operand and returns the result of the
1222/// comparison.
1223///
1224/// If either of the two lower 32-bit values is NaN, 0 is returned.
1225///
1226/// \headerfile <x86intrin.h>
1227///
1228/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1229///
1230/// \param __a
1231/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1232/// used in the comparison.
1233/// \param __b
1234/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1235/// used in the comparison.
1236/// \returns An integer containing the comparison results. If either of the two
1237/// lower 32-bit values is NaN, 0 is returned.
1238static __inline__ int __DEFAULT_FN_ATTRS
1239_mm_ucomigt_ss(__m128 __a, __m128 __b)
1240{
1241 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1242}
1243
1244/// Performs an unordered comparison of two 32-bit float values using
1245/// the low-order bits of both operands to determine if the first operand is
1246/// greater than or equal to the second operand and returns the result of
1247/// the comparison.
1248///
1249/// If either of the two lower 32-bit values is NaN, 0 is returned.
1250///
1251/// \headerfile <x86intrin.h>
1252///
1253/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1254///
1255/// \param __a
1256/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1257/// used in the comparison.
1258/// \param __b
1259/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1260/// used in the comparison.
1261/// \returns An integer containing the comparison results. If either of the two
1262/// lower 32-bit values is NaN, 0 is returned.
1263static __inline__ int __DEFAULT_FN_ATTRS
1264_mm_ucomige_ss(__m128 __a, __m128 __b)
1265{
1266 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1267}
1268
1269/// Performs an unordered comparison of two 32-bit float values using
1270/// the low-order bits of both operands to determine inequality and returns
1271/// the result of the comparison.
1272///
1273/// If either of the two lower 32-bit values is NaN, 1 is returned.
1274///
1275/// \headerfile <x86intrin.h>
1276///
1277/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1278///
1279/// \param __a
1280/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1281/// used in the comparison.
1282/// \param __b
1283/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1284/// used in the comparison.
1285/// \returns An integer containing the comparison results. If either of the two
1286/// lower 32-bit values is NaN, 1 is returned.
1287static __inline__ int __DEFAULT_FN_ATTRS
1288_mm_ucomineq_ss(__m128 __a, __m128 __b)
1289{
1290 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1291}
1292
1293/// Converts a float value contained in the lower 32 bits of a vector of
1294/// [4 x float] into a 32-bit integer.
1295///
1296/// \headerfile <x86intrin.h>
1297///
1298/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1299/// instructions.
1300///
1301/// \param __a
1302/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1303/// used in the conversion.
1304/// \returns A 32-bit integer containing the converted value.
1305static __inline__ int __DEFAULT_FN_ATTRS
1307{
1308 return __builtin_ia32_cvtss2si((__v4sf)__a);
1309}
1310
1311/// Converts a float value contained in the lower 32 bits of a vector of
1312/// [4 x float] into a 32-bit integer.
1313///
1314/// \headerfile <x86intrin.h>
1315///
1316/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1317/// instructions.
1318///
1319/// \param __a
1320/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1321/// used in the conversion.
1322/// \returns A 32-bit integer containing the converted value.
1323static __inline__ int __DEFAULT_FN_ATTRS
1325{
1326 return _mm_cvtss_si32(__a);
1327}
1328
1329#ifdef __x86_64__
1330
1331/// Converts a float value contained in the lower 32 bits of a vector of
1332/// [4 x float] into a 64-bit integer.
1333///
1334/// \headerfile <x86intrin.h>
1335///
1336/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1337/// instructions.
1338///
1339/// \param __a
1340/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1341/// used in the conversion.
1342/// \returns A 64-bit integer containing the converted value.
1343static __inline__ long long __DEFAULT_FN_ATTRS
1344_mm_cvtss_si64(__m128 __a)
1345{
1346 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1347}
1348
1349#endif
1350
1351/// Converts two low-order float values in a 128-bit vector of
1352/// [4 x float] into a 64-bit vector of [2 x i32].
1353///
1354/// \headerfile <x86intrin.h>
1355///
1356/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1357///
1358/// \param __a
1359/// A 128-bit vector of [4 x float].
1360/// \returns A 64-bit integer vector containing the converted values.
1361static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1363{
1364 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1365}
1366
1367/// Converts two low-order float values in a 128-bit vector of
1368/// [4 x float] into a 64-bit vector of [2 x i32].
1369///
1370/// \headerfile <x86intrin.h>
1371///
1372/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1373///
1374/// \param __a
1375/// A 128-bit vector of [4 x float].
1376/// \returns A 64-bit integer vector containing the converted values.
1377static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1379{
1380 return _mm_cvtps_pi32(__a);
1381}
1382
1383/// Converts a float value contained in the lower 32 bits of a vector of
1384/// [4 x float] into a 32-bit integer, truncating the result when it is
1385/// inexact.
1386///
1387/// \headerfile <x86intrin.h>
1388///
1389/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1390/// instructions.
1391///
1392/// \param __a
1393/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1394/// used in the conversion.
1395/// \returns A 32-bit integer containing the converted value.
1396static __inline__ int __DEFAULT_FN_ATTRS
1398{
1399 return __builtin_ia32_cvttss2si((__v4sf)__a);
1400}
1401
1402/// Converts a float value contained in the lower 32 bits of a vector of
1403/// [4 x float] into a 32-bit integer, truncating the result when it is
1404/// inexact.
1405///
1406/// \headerfile <x86intrin.h>
1407///
1408/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1409/// instructions.
1410///
1411/// \param __a
1412/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1413/// used in the conversion.
1414/// \returns A 32-bit integer containing the converted value.
1415static __inline__ int __DEFAULT_FN_ATTRS
1417{
1418 return _mm_cvttss_si32(__a);
1419}
1420
1421#ifdef __x86_64__
1422/// Converts a float value contained in the lower 32 bits of a vector of
1423/// [4 x float] into a 64-bit integer, truncating the result when it is
1424/// inexact.
1425///
1426/// \headerfile <x86intrin.h>
1427///
1428/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1429/// instructions.
1430///
1431/// \param __a
1432/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1433/// used in the conversion.
1434/// \returns A 64-bit integer containing the converted value.
1435static __inline__ long long __DEFAULT_FN_ATTRS
1436_mm_cvttss_si64(__m128 __a)
1437{
1438 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1439}
1440#endif
1441
1442/// Converts two low-order float values in a 128-bit vector of
1443/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1444/// when it is inexact.
1445///
1446/// \headerfile <x86intrin.h>
1447///
1448/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1449/// instructions.
1450///
1451/// \param __a
1452/// A 128-bit vector of [4 x float].
1453/// \returns A 64-bit integer vector containing the converted values.
1454static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1456{
1457 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1458}
1459
1460/// Converts two low-order float values in a 128-bit vector of [4 x
1461/// float] into a 64-bit vector of [2 x i32], truncating the result when it
1462/// is inexact.
1463///
1464/// \headerfile <x86intrin.h>
1465///
1466/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1467///
1468/// \param __a
1469/// A 128-bit vector of [4 x float].
1470/// \returns A 64-bit integer vector containing the converted values.
1471static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1473{
1474 return _mm_cvttps_pi32(__a);
1475}
1476
1477/// Converts a 32-bit signed integer value into a floating point value
1478/// and writes it to the lower 32 bits of the destination. The remaining
1479/// higher order elements of the destination vector are copied from the
1480/// corresponding elements in the first operand.
1481///
1482/// \headerfile <x86intrin.h>
1483///
1484/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1485///
1486/// \param __a
1487/// A 128-bit vector of [4 x float].
1488/// \param __b
1489/// A 32-bit signed integer operand containing the value to be converted.
1490/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1491/// converted value of the second operand. The upper 96 bits are copied from
1492/// the upper 96 bits of the first operand.
1493static __inline__ __m128 __DEFAULT_FN_ATTRS
1495{
1496 __a[0] = __b;
1497 return __a;
1498}
1499
1500/// Converts a 32-bit signed integer value into a floating point value
1501/// and writes it to the lower 32 bits of the destination. The remaining
1502/// higher order elements of the destination are copied from the
1503/// corresponding elements in the first operand.
1504///
1505/// \headerfile <x86intrin.h>
1506///
1507/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1508///
1509/// \param __a
1510/// A 128-bit vector of [4 x float].
1511/// \param __b
1512/// A 32-bit signed integer operand containing the value to be converted.
1513/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1514/// converted value of the second operand. The upper 96 bits are copied from
1515/// the upper 96 bits of the first operand.
1516static __inline__ __m128 __DEFAULT_FN_ATTRS
1518{
1519 return _mm_cvtsi32_ss(__a, __b);
1520}
1521
1522#ifdef __x86_64__
1523
1524/// Converts a 64-bit signed integer value into a floating point value
1525/// and writes it to the lower 32 bits of the destination. The remaining
1526/// higher order elements of the destination are copied from the
1527/// corresponding elements in the first operand.
1528///
1529/// \headerfile <x86intrin.h>
1530///
1531/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1532///
1533/// \param __a
1534/// A 128-bit vector of [4 x float].
1535/// \param __b
1536/// A 64-bit signed integer operand containing the value to be converted.
1537/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1538/// converted value of the second operand. The upper 96 bits are copied from
1539/// the upper 96 bits of the first operand.
1540static __inline__ __m128 __DEFAULT_FN_ATTRS
1541_mm_cvtsi64_ss(__m128 __a, long long __b)
1542{
1543 __a[0] = __b;
1544 return __a;
1545}
1546
1547#endif
1548
1549/// Converts two elements of a 64-bit vector of [2 x i32] into two
1550/// floating point values and writes them to the lower 64-bits of the
1551/// destination. The remaining higher order elements of the destination are
1552/// copied from the corresponding elements in the first operand.
1553///
1554/// \headerfile <x86intrin.h>
1555///
1556/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1557///
1558/// \param __a
1559/// A 128-bit vector of [4 x float].
1560/// \param __b
1561/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1562/// and written to the corresponding low-order elements in the destination.
1563/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1564/// converted value of the second operand. The upper 64 bits are copied from
1565/// the upper 64 bits of the first operand.
1566static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1567_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1568{
1569 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1570}
1571
1572/// Converts two elements of a 64-bit vector of [2 x i32] into two
1573/// floating point values and writes them to the lower 64-bits of the
1574/// destination. The remaining higher order elements of the destination are
1575/// copied from the corresponding elements in the first operand.
1576///
1577/// \headerfile <x86intrin.h>
1578///
1579/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1580///
1581/// \param __a
1582/// A 128-bit vector of [4 x float].
1583/// \param __b
1584/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1585/// and written to the corresponding low-order elements in the destination.
1586/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1587/// converted value from the second operand. The upper 64 bits are copied
1588/// from the upper 64 bits of the first operand.
1589static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1590_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1591{
1592 return _mm_cvtpi32_ps(__a, __b);
1593}
1594
1595/// Extracts a float value contained in the lower 32 bits of a vector of
1596/// [4 x float].
1597///
1598/// \headerfile <x86intrin.h>
1599///
1600/// This intrinsic has no corresponding instruction.
1601///
1602/// \param __a
1603/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1604/// used in the extraction.
1605/// \returns A 32-bit float containing the extracted value.
1606static __inline__ float __DEFAULT_FN_ATTRS
1608{
1609 return __a[0];
1610}
1611
1612/// Loads two packed float values from the address \a __p into the
1613/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1614/// are copied from the low-order bits of the first operand.
1615///
1616/// \headerfile <x86intrin.h>
1617///
1618/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1619///
1620/// \param __a
1621/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1622/// of the destination.
1623/// \param __p
1624/// A pointer to two packed float values. Bits [63:0] are written to bits
1625/// [127:64] of the destination.
1626/// \returns A 128-bit vector of [4 x float] containing the moved values.
1627static __inline__ __m128 __DEFAULT_FN_ATTRS
1628_mm_loadh_pi(__m128 __a, const __m64 *__p)
1629{
1630 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1631 struct __mm_loadh_pi_struct {
1632 __mm_loadh_pi_v2f32 __u;
1633 } __attribute__((__packed__, __may_alias__));
1634 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1635 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1636 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1637}
1638
1639/// Loads two packed float values from the address \a __p into the
1640/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1641/// are copied from the high-order bits of the first operand.
1642///
1643/// \headerfile <x86intrin.h>
1644///
1645/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1646///
1647/// \param __a
1648/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1649/// [127:64] of the destination.
1650/// \param __p
1651/// A pointer to two packed float values. Bits [63:0] are written to bits
1652/// [63:0] of the destination.
1653/// \returns A 128-bit vector of [4 x float] containing the moved values.
1654static __inline__ __m128 __DEFAULT_FN_ATTRS
1655_mm_loadl_pi(__m128 __a, const __m64 *__p)
1656{
1657 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1658 struct __mm_loadl_pi_struct {
1659 __mm_loadl_pi_v2f32 __u;
1660 } __attribute__((__packed__, __may_alias__));
1661 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1662 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1663 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1664}
1665
1666/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1667/// 32 bits of the vector are initialized with the single-precision
1668/// floating-point value loaded from a specified memory location. The upper
1669/// 96 bits are set to zero.
1670///
1671/// \headerfile <x86intrin.h>
1672///
1673/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1674///
1675/// \param __p
1676/// A pointer to a 32-bit memory location containing a single-precision
1677/// floating-point value.
1678/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1679/// lower 32 bits contain the value loaded from the memory location. The
1680/// upper 96 bits are set to zero.
1681static __inline__ __m128 __DEFAULT_FN_ATTRS
1682_mm_load_ss(const float *__p)
1683{
1684 struct __mm_load_ss_struct {
1685 float __u;
1686 } __attribute__((__packed__, __may_alias__));
1687 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1688 return __extension__ (__m128){ __u, 0, 0, 0 };
1689}
1690
1691/// Loads a 32-bit float value and duplicates it to all four vector
1692/// elements of a 128-bit vector of [4 x float].
1693///
1694/// \headerfile <x86intrin.h>
1695///
1696/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1697/// instruction.
1698///
1699/// \param __p
1700/// A pointer to a float value to be loaded and duplicated.
1701/// \returns A 128-bit vector of [4 x float] containing the loaded and
1702/// duplicated values.
1703static __inline__ __m128 __DEFAULT_FN_ATTRS
1704_mm_load1_ps(const float *__p)
1705{
1706 struct __mm_load1_ps_struct {
1707 float __u;
1708 } __attribute__((__packed__, __may_alias__));
1709 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1710 return __extension__ (__m128){ __u, __u, __u, __u };
1711}
1712
1713#define _mm_load_ps1(p) _mm_load1_ps(p)
1714
1715/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1716/// memory location.
1717///
1718/// \headerfile <x86intrin.h>
1719///
1720/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1721///
1722/// \param __p
1723/// A pointer to a 128-bit memory location. The address of the memory
1724/// location has to be 128-bit aligned.
1725/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1726static __inline__ __m128 __DEFAULT_FN_ATTRS
1727_mm_load_ps(const float *__p)
1728{
1729 return *(const __m128*)__p;
1730}
1731
1732/// Loads a 128-bit floating-point vector of [4 x float] from an
1733/// unaligned memory location.
1734///
1735/// \headerfile <x86intrin.h>
1736///
1737/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1738///
1739/// \param __p
1740/// A pointer to a 128-bit memory location. The address of the memory
1741/// location does not have to be aligned.
1742/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1743static __inline__ __m128 __DEFAULT_FN_ATTRS
1744_mm_loadu_ps(const float *__p)
1745{
1746 struct __loadu_ps {
1747 __m128_u __v;
1748 } __attribute__((__packed__, __may_alias__));
1749 return ((const struct __loadu_ps*)__p)->__v;
1750}
1751
1752/// Loads four packed float values, in reverse order, from an aligned
1753/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1754///
1755/// \headerfile <x86intrin.h>
1756///
1757/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1758/// instruction.
1759///
1760/// \param __p
1761/// A pointer to a 128-bit memory location. The address of the memory
1762/// location has to be 128-bit aligned.
1763/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1764/// in reverse order.
1765static __inline__ __m128 __DEFAULT_FN_ATTRS
1766_mm_loadr_ps(const float *__p)
1767{
1768 __m128 __a = _mm_load_ps(__p);
1769 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1770}
1771
1772/// Create a 128-bit vector of [4 x float] with undefined values.
1773///
1774/// \headerfile <x86intrin.h>
1775///
1776/// This intrinsic has no corresponding instruction.
1777///
1778/// \returns A 128-bit vector of [4 x float] containing undefined values.
1779static __inline__ __m128 __DEFAULT_FN_ATTRS
1781{
1782 return (__m128)__builtin_ia32_undef128();
1783}
1784
1785/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1786/// 32 bits of the vector are initialized with the specified single-precision
1787/// floating-point value. The upper 96 bits are set to zero.
1788///
1789/// \headerfile <x86intrin.h>
1790///
1791/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1792///
1793/// \param __w
1794/// A single-precision floating-point value used to initialize the lower 32
1795/// bits of the result.
1796/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1797/// lower 32 bits contain the value provided in the source operand. The
1798/// upper 96 bits are set to zero.
1799static __inline__ __m128 __DEFAULT_FN_ATTRS
1800_mm_set_ss(float __w)
1801{
1802 return __extension__ (__m128){ __w, 0, 0, 0 };
1803}
1804
1805/// Constructs a 128-bit floating-point vector of [4 x float], with each
1806/// of the four single-precision floating-point vector elements set to the
1807/// specified single-precision floating-point value.
1808///
1809/// \headerfile <x86intrin.h>
1810///
1811/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1812///
1813/// \param __w
1814/// A single-precision floating-point value used to initialize each vector
1815/// element of the result.
1816/// \returns An initialized 128-bit floating-point vector of [4 x float].
1817static __inline__ __m128 __DEFAULT_FN_ATTRS
1818_mm_set1_ps(float __w)
1819{
1820 return __extension__ (__m128){ __w, __w, __w, __w };
1821}
1822
1823/* Microsoft specific. */
1824/// Constructs a 128-bit floating-point vector of [4 x float], with each
1825/// of the four single-precision floating-point vector elements set to the
1826/// specified single-precision floating-point value.
1827///
1828/// \headerfile <x86intrin.h>
1829///
1830/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1831///
1832/// \param __w
1833/// A single-precision floating-point value used to initialize each vector
1834/// element of the result.
1835/// \returns An initialized 128-bit floating-point vector of [4 x float].
1836static __inline__ __m128 __DEFAULT_FN_ATTRS
1837_mm_set_ps1(float __w)
1838{
1839 return _mm_set1_ps(__w);
1840}
1841
1842/// Constructs a 128-bit floating-point vector of [4 x float]
1843/// initialized with the specified single-precision floating-point values.
1844///
1845/// \headerfile <x86intrin.h>
1846///
1847/// This intrinsic is a utility function and does not correspond to a specific
1848/// instruction.
1849///
1850/// \param __z
1851/// A single-precision floating-point value used to initialize bits [127:96]
1852/// of the result.
1853/// \param __y
1854/// A single-precision floating-point value used to initialize bits [95:64]
1855/// of the result.
1856/// \param __x
1857/// A single-precision floating-point value used to initialize bits [63:32]
1858/// of the result.
1859/// \param __w
1860/// A single-precision floating-point value used to initialize bits [31:0]
1861/// of the result.
1862/// \returns An initialized 128-bit floating-point vector of [4 x float].
1863static __inline__ __m128 __DEFAULT_FN_ATTRS
1864_mm_set_ps(float __z, float __y, float __x, float __w)
1865{
1866 return __extension__ (__m128){ __w, __x, __y, __z };
1867}
1868
1869/// Constructs a 128-bit floating-point vector of [4 x float],
1870/// initialized in reverse order with the specified 32-bit single-precision
1871/// float-point values.
1872///
1873/// \headerfile <x86intrin.h>
1874///
1875/// This intrinsic is a utility function and does not correspond to a specific
1876/// instruction.
1877///
1878/// \param __z
1879/// A single-precision floating-point value used to initialize bits [31:0]
1880/// of the result.
1881/// \param __y
1882/// A single-precision floating-point value used to initialize bits [63:32]
1883/// of the result.
1884/// \param __x
1885/// A single-precision floating-point value used to initialize bits [95:64]
1886/// of the result.
1887/// \param __w
1888/// A single-precision floating-point value used to initialize bits [127:96]
1889/// of the result.
1890/// \returns An initialized 128-bit floating-point vector of [4 x float].
1891static __inline__ __m128 __DEFAULT_FN_ATTRS
1892_mm_setr_ps(float __z, float __y, float __x, float __w)
1893{
1894 return __extension__ (__m128){ __z, __y, __x, __w };
1895}
1896
1897/// Constructs a 128-bit floating-point vector of [4 x float] initialized
1898/// to zero.
1899///
1900/// \headerfile <x86intrin.h>
1901///
1902/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1903///
1904/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1905/// all elements set to zero.
1906static __inline__ __m128 __DEFAULT_FN_ATTRS
1908{
1909 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
1910}
1911
1912/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1913/// memory location.
1914///
1915/// \headerfile <x86intrin.h>
1916///
1917/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1918///
1919/// \param __p
1920/// A pointer to a 64-bit memory location.
1921/// \param __a
1922/// A 128-bit vector of [4 x float] containing the values to be stored.
1923static __inline__ void __DEFAULT_FN_ATTRS
1924_mm_storeh_pi(__m64 *__p, __m128 __a)
1925{
1926 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1927 struct __mm_storeh_pi_struct {
1928 __mm_storeh_pi_v2f32 __u;
1929 } __attribute__((__packed__, __may_alias__));
1930 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1931}
1932
1933/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1934/// memory location.
1935///
1936/// \headerfile <x86intrin.h>
1937///
1938/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1939///
1940/// \param __p
1941/// A pointer to a memory location that will receive the float values.
1942/// \param __a
1943/// A 128-bit vector of [4 x float] containing the values to be stored.
1944static __inline__ void __DEFAULT_FN_ATTRS
1945_mm_storel_pi(__m64 *__p, __m128 __a)
1946{
1947 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1948 struct __mm_storeh_pi_struct {
1949 __mm_storeh_pi_v2f32 __u;
1950 } __attribute__((__packed__, __may_alias__));
1951 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1952}
1953
1954/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1955/// memory location.
1956///
1957/// \headerfile <x86intrin.h>
1958///
1959/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1960///
1961/// \param __p
1962/// A pointer to a 32-bit memory location.
1963/// \param __a
1964/// A 128-bit vector of [4 x float] containing the value to be stored.
1965static __inline__ void __DEFAULT_FN_ATTRS
1966_mm_store_ss(float *__p, __m128 __a)
1967{
1968 struct __mm_store_ss_struct {
1969 float __u;
1970 } __attribute__((__packed__, __may_alias__));
1971 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1972}
1973
1974/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1975/// location.
1976///
1977/// \headerfile <x86intrin.h>
1978///
1979/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1980///
1981/// \param __p
1982/// A pointer to a 128-bit memory location. The address of the memory
1983/// location does not have to be aligned.
1984/// \param __a
1985/// A 128-bit vector of [4 x float] containing the values to be stored.
1986static __inline__ void __DEFAULT_FN_ATTRS
1987_mm_storeu_ps(float *__p, __m128 __a)
1988{
1989 struct __storeu_ps {
1990 __m128_u __v;
1991 } __attribute__((__packed__, __may_alias__));
1992 ((struct __storeu_ps*)__p)->__v = __a;
1993}
1994
1995/// Stores a 128-bit vector of [4 x float] into an aligned memory
1996/// location.
1997///
1998/// \headerfile <x86intrin.h>
1999///
2000/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2001///
2002/// \param __p
2003/// A pointer to a 128-bit memory location. The address of the memory
2004/// location has to be 16-byte aligned.
2005/// \param __a
2006/// A 128-bit vector of [4 x float] containing the values to be stored.
2007static __inline__ void __DEFAULT_FN_ATTRS
2008_mm_store_ps(float *__p, __m128 __a)
2009{
2010 *(__m128*)__p = __a;
2011}
2012
2013/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2014/// four contiguous elements in an aligned memory location.
2015///
2016/// \headerfile <x86intrin.h>
2017///
2018/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2019/// instruction.
2020///
2021/// \param __p
2022/// A pointer to a 128-bit memory location.
2023/// \param __a
2024/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2025/// of the four contiguous elements pointed by \a __p.
2026static __inline__ void __DEFAULT_FN_ATTRS
2027_mm_store1_ps(float *__p, __m128 __a)
2028{
2029 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2031}
2032
2033/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2034/// four contiguous elements in an aligned memory location.
2035///
2036/// \headerfile <x86intrin.h>
2037///
2038/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2039/// instruction.
2040///
2041/// \param __p
2042/// A pointer to a 128-bit memory location.
2043/// \param __a
2044/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2045/// of the four contiguous elements pointed by \a __p.
2046static __inline__ void __DEFAULT_FN_ATTRS
2047_mm_store_ps1(float *__p, __m128 __a)
2048{
2050}
2051
2052/// Stores float values from a 128-bit vector of [4 x float] to an
2053/// aligned memory location in reverse order.
2054///
2055/// \headerfile <x86intrin.h>
2056///
2057/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2058/// instruction.
2059///
2060/// \param __p
2061/// A pointer to a 128-bit memory location. The address of the memory
2062/// location has to be 128-bit aligned.
2063/// \param __a
2064/// A 128-bit vector of [4 x float] containing the values to be stored.
2065static __inline__ void __DEFAULT_FN_ATTRS
2066_mm_storer_ps(float *__p, __m128 __a)
2067{
2068 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2070}
2071
/* Hint selectors for _mm_prefetch(), listed in ascending numeric order. Where
   this header supplies the _mm_prefetch macro, it passes bit 2 of the
   selector as the read/write argument and bits [1:0] as the locality
   argument of __builtin_prefetch. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2078
#ifndef _MSC_VER
/* FIXME: This has to be a #define because "sel" must be a constant integer,
   and Sema doesn't do any form of constant propagation yet. */

/// Brings one cache line of data from the given address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant selecting the kind of prefetch: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction
///    will be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction
///    will be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction
///    will be generated.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2110
2111/// Stores a 64-bit integer in the specified aligned memory location. To
2112/// minimize caching, the data is flagged as non-temporal (unlikely to be
2113/// used again soon).
2114///
2115/// \headerfile <x86intrin.h>
2116///
2117/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2118///
2119/// \param __p
2120/// A pointer to an aligned memory location used to store the register value.
2121/// \param __a
2122/// A 64-bit integer containing the value to be stored.
2123static __inline__ void __DEFAULT_FN_ATTRS_MMX
2124_mm_stream_pi(void *__p, __m64 __a)
2125{
2126 __builtin_ia32_movntq((__m64 *)__p, __a);
2127}
2128
2129/// Moves packed float values from a 128-bit vector of [4 x float] to a
2130/// 128-bit aligned memory location. To minimize caching, the data is flagged
2131/// as non-temporal (unlikely to be used again soon).
2132///
2133/// \headerfile <x86intrin.h>
2134///
2135/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2136///
2137/// \param __p
2138/// A pointer to a 128-bit aligned memory location that will receive the
2139/// single-precision floating-point values.
2140/// \param __a
2141/// A 128-bit vector of [4 x float] containing the values to be moved.
2142static __inline__ void __DEFAULT_FN_ATTRS
2143_mm_stream_ps(void *__p, __m128 __a)
2144{
2145 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2146}
2147
#ifdef __cplusplus
extern "C" {
#endif

/// Enforces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and the store instructions
///    that follow it, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2166
/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data,
///    zero-extended to \c int.
/* Both arguments are parenthesized so that arbitrary expressions may be
   passed. The cast through unsigned short zero-extends the element instead of
   sign-extending it. */
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), \
                                                    (int)(n)))
2189
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16 bits of an integer operand at the 16-bit
///    offset specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: Bits [15:0] are written. \n
///    1: Bits [31:16] are written. \n
///    2: Bits [47:32] are written. \n
///    3: Bits [63:48] are written. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Every argument is parenthesized so that arbitrary expressions may be
   passed. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
2220
2221/// Compares each of the corresponding packed 16-bit integer values of
2222/// the 64-bit integer vectors, and writes the greater value to the
2223/// corresponding bits in the destination.
2224///
2225/// \headerfile <x86intrin.h>
2226///
2227/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2228///
2229/// \param __a
2230/// A 64-bit integer vector containing one of the source operands.
2231/// \param __b
2232/// A 64-bit integer vector containing one of the source operands.
2233/// \returns A 64-bit integer vector containing the comparison results.
2234static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2235_mm_max_pi16(__m64 __a, __m64 __b)
2236{
2237 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2238}
2239
2240/// Compares each of the corresponding packed 8-bit unsigned integer
2241/// values of the 64-bit integer vectors, and writes the greater value to the
2242/// corresponding bits in the destination.
2243///
2244/// \headerfile <x86intrin.h>
2245///
2246/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2247///
2248/// \param __a
2249/// A 64-bit integer vector containing one of the source operands.
2250/// \param __b
2251/// A 64-bit integer vector containing one of the source operands.
2252/// \returns A 64-bit integer vector containing the comparison results.
2253static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2254_mm_max_pu8(__m64 __a, __m64 __b)
2255{
2256 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2257}
2258
2259/// Compares each of the corresponding packed 16-bit integer values of
2260/// the 64-bit integer vectors, and writes the lesser value to the
2261/// corresponding bits in the destination.
2262///
2263/// \headerfile <x86intrin.h>
2264///
2265/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2266///
2267/// \param __a
2268/// A 64-bit integer vector containing one of the source operands.
2269/// \param __b
2270/// A 64-bit integer vector containing one of the source operands.
2271/// \returns A 64-bit integer vector containing the comparison results.
2272static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2273_mm_min_pi16(__m64 __a, __m64 __b)
2274{
2275 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2276}
2277
2278/// Compares each of the corresponding packed 8-bit unsigned integer
2279/// values of the 64-bit integer vectors, and writes the lesser value to the
2280/// corresponding bits in the destination.
2281///
2282/// \headerfile <x86intrin.h>
2283///
2284/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2285///
2286/// \param __a
2287/// A 64-bit integer vector containing one of the source operands.
2288/// \param __b
2289/// A 64-bit integer vector containing one of the source operands.
2290/// \returns A 64-bit integer vector containing the comparison results.
2291static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2292_mm_min_pu8(__m64 __a, __m64 __b)
2293{
2294 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2295}
2296
2297/// Takes the most significant bit from each 8-bit element in a 64-bit
2298/// integer vector to create an 8-bit mask value. Zero-extends the value to
2299/// 32-bit integer and writes it to the destination.
2300///
2301/// \headerfile <x86intrin.h>
2302///
2303/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2304///
2305/// \param __a
2306/// A 64-bit integer vector containing the values with bits to be extracted.
2307/// \returns The most significant bit from each 8-bit element in \a __a,
2308/// written to bits [7:0].
2309static __inline__ int __DEFAULT_FN_ATTRS_MMX
2311{
2312 return __builtin_ia32_pmovmskb((__v8qi)__a);
2313}
2314
2315/// Multiplies packed 16-bit unsigned integer values and writes the
2316/// high-order 16 bits of each 32-bit product to the corresponding bits in
2317/// the destination.
2318///
2319/// \headerfile <x86intrin.h>
2320///
2321/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2322///
2323/// \param __a
2324/// A 64-bit integer vector containing one of the source operands.
2325/// \param __b
2326/// A 64-bit integer vector containing one of the source operands.
2327/// \returns A 64-bit integer vector containing the products of both operands.
2328static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2330{
2331 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2332}
2333
/// Rearranges the four 16-bit integers of a 64-bit integer vector into the
///    destination, as directed by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    Each 2-bit field of \a n selects the source for one 16-bit field of the
///    destination: \n
///    Bits [1:0] select the source for bits [15:0] of the destination. \n
///    Bits [3:2] select the source for bits [31:16] of the destination. \n
///    Bits [5:4] select the source for bits [47:32] of the destination. \n
///    Bits [7:6] select the source for bits [63:48] of the destination. \n
///    Field values: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                  \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2370
2371/// Conditionally copies the values from each 8-bit element in the first
2372/// 64-bit integer vector operand to the specified memory location, as
2373/// specified by the most significant bit in the corresponding element in the
2374/// second 64-bit integer vector operand.
2375///
2376/// To minimize caching, the data is flagged as non-temporal
2377/// (unlikely to be used again soon).
2378///
2379/// \headerfile <x86intrin.h>
2380///
2381/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2382///
2383/// \param __d
2384/// A 64-bit integer vector containing the values with elements to be copied.
2385/// \param __n
2386/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2387/// element determines whether the corresponding element in operand \a __d
2388/// is copied. If the most significant bit of a given element is 1, the
2389/// corresponding element in operand \a __d is copied.
2390/// \param __p
2391/// A pointer to a 64-bit memory location that will receive the conditionally
2392/// copied integer values. The address of the memory location does not have
2393/// to be aligned.
2394static __inline__ void __DEFAULT_FN_ATTRS_MMX
2395_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2396{
2397 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2398}
2399
2400/// Computes the rounded averages of the packed unsigned 8-bit integer
2401/// values and writes the averages to the corresponding bits in the
2402/// destination.
2403///
2404/// \headerfile <x86intrin.h>
2405///
2406/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2407///
2408/// \param __a
2409/// A 64-bit integer vector containing one of the source operands.
2410/// \param __b
2411/// A 64-bit integer vector containing one of the source operands.
2412/// \returns A 64-bit integer vector containing the averages of both operands.
2413static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2414_mm_avg_pu8(__m64 __a, __m64 __b)
2415{
2416 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2417}
2418
2419/// Computes the rounded averages of the packed unsigned 16-bit integer
2420/// values and writes the averages to the corresponding bits in the
2421/// destination.
2422///
2423/// \headerfile <x86intrin.h>
2424///
2425/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2426///
2427/// \param __a
2428/// A 64-bit integer vector containing one of the source operands.
2429/// \param __b
2430/// A 64-bit integer vector containing one of the source operands.
2431/// \returns A 64-bit integer vector containing the averages of both operands.
2432static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2433_mm_avg_pu16(__m64 __a, __m64 __b)
2434{
2435 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2436}
2437
2438/// Subtracts the corresponding 8-bit unsigned integer values of the two
2439/// 64-bit vector operands and computes the absolute value for each of the
2440/// difference. Then sum of the 8 absolute differences is written to the
2441/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2442///
2443/// \headerfile <x86intrin.h>
2444///
2445/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2446///
2447/// \param __a
2448/// A 64-bit integer vector containing one of the source operands.
2449/// \param __b
2450/// A 64-bit integer vector containing one of the source operands.
2451/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2452/// sets of absolute differences between both operands. The upper bits are
2453/// cleared.
2454static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2455_mm_sad_pu8(__m64 __a, __m64 __b)
2456{
2457 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2458}
2459
2460#if defined(__cplusplus)
2461extern "C" {
2462#endif
2463
/// Reads the MXCSR register and returns its contents as a 32-bit unsigned
///    integer value.
///
///    Several groups of macros are associated with this intrinsic:
///    <ul>
///    <li>
///      Exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. The convenience wrapper is
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO,
///      _MM_MASK_INEXACT. The convenience wrapper is
///      _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. The convenience wrapper is
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. The
///      convenience wrapper is _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. The convenience wrapper is
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks whether an overflow
///    exception has occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);
2515
2516/// Sets the MXCSR register with the 32-bit unsigned integer value.
2517///
2518/// There are several groups of macros associated with this intrinsic,
2519/// including:
2520/// <ul>
2521/// <li>
2522/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2523/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2524/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2525/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2526/// </li>
2527/// <li>
2528/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2529/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2530/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2531/// of these macros.
2532/// </li>
2533/// <li>
2534/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2535/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2536/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2537/// </li>
2538/// <li>
2539/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2540/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2541/// one of these macros.
2542/// </li>
2543/// <li>
2544/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2545/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2546/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2547/// </li>
2548/// </ul>
2549///
2550/// For example, the following expression causes subsequent floating-point
2551/// operations to round up:
2552/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2553///
2554/// The following example sets the DAZ and FTZ flags:
2555/// \code
2556/// void setFlags() {
2557/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2558/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2559/// }
2560/// \endcode
2561///
2562/// \headerfile <x86intrin.h>
2563///
2564/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2565///
2566/// \param __i
2567/// A 32-bit unsigned integer value to be written to the MXCSR register.
2568void _mm_setcsr(unsigned int __i);
2569
2570#if defined(__cplusplus)
2571} // extern "C"
2572#endif
2573
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))

2617/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2618/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2619///
2620/// \headerfile <x86intrin.h>
2621///
2622/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2623///
2624/// \param __a
2625/// A 128-bit vector of [4 x float]. \n
2626/// Bits [95:64] are written to bits [31:0] of the destination. \n
2627/// Bits [127:96] are written to bits [95:64] of the destination.
2628/// \param __b
2629/// A 128-bit vector of [4 x float].
2630/// Bits [95:64] are written to bits [63:32] of the destination. \n
2631/// Bits [127:96] are written to bits [127:96] of the destination.
2632/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2633static __inline__ __m128 __DEFAULT_FN_ATTRS
2634_mm_unpackhi_ps(__m128 __a, __m128 __b)
2635{
2636 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2637}
2638
2639/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2640/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2641///
2642/// \headerfile <x86intrin.h>
2643///
2644/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2645///
2646/// \param __a
2647/// A 128-bit vector of [4 x float]. \n
2648/// Bits [31:0] are written to bits [31:0] of the destination. \n
2649/// Bits [63:32] are written to bits [95:64] of the destination.
2650/// \param __b
2651/// A 128-bit vector of [4 x float]. \n
2652/// Bits [31:0] are written to bits [63:32] of the destination. \n
2653/// Bits [63:32] are written to bits [127:96] of the destination.
2654/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2655static __inline__ __m128 __DEFAULT_FN_ATTRS
2656_mm_unpacklo_ps(__m128 __a, __m128 __b)
2657{
2658 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2659}
2660
2661/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2662/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2663/// 96 bits are set to the upper 96 bits of the first parameter.
2664///
2665/// \headerfile <x86intrin.h>
2666///
2667/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2668/// instruction.
2669///
2670/// \param __a
2671/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2672/// written to the upper 96 bits of the result.
2673/// \param __b
2674/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2675/// written to the lower 32 bits of the result.
2676/// \returns A 128-bit floating-point vector of [4 x float].
2677static __inline__ __m128 __DEFAULT_FN_ATTRS
2678_mm_move_ss(__m128 __a, __m128 __b)
2679{
2680 __a[0] = __b[0];
2681 return __a;
2682}
2683
2684/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2685/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2686/// 64 bits are set to the upper 64 bits of the first parameter.
2687///
2688/// \headerfile <x86intrin.h>
2689///
2690/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2691///
2692/// \param __a
2693/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2694/// written to the upper 64 bits of the result.
2695/// \param __b
2696/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2697/// written to the lower 64 bits of the result.
2698/// \returns A 128-bit floating-point vector of [4 x float].
2699static __inline__ __m128 __DEFAULT_FN_ATTRS
2700_mm_movehl_ps(__m128 __a, __m128 __b)
2701{
2702 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2703}
2704
2705/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2706/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2707/// 64 bits are set to the lower 64 bits of the second parameter.
2708///
2709/// \headerfile <x86intrin.h>
2710///
2711/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2712///
2713/// \param __a
2714/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2715/// written to the lower 64 bits of the result.
2716/// \param __b
2717/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2718/// written to the upper 64 bits of the result.
2719/// \returns A 128-bit floating-point vector of [4 x float].
2720static __inline__ __m128 __DEFAULT_FN_ATTRS
2721_mm_movelh_ps(__m128 __a, __m128 __b)
2722{
2723 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2724}
2725
2726/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2727/// float].
2728///
2729/// \headerfile <x86intrin.h>
2730///
2731/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2732///
2733/// \param __a
2734/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2735/// from the corresponding elements in this operand.
2736/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2737/// values from the operand.
2738static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2740{
2741 __m64 __b, __c;
2742 __m128 __r;
2743
2747 __r = _mm_setzero_ps();
2748 __r = _mm_cvtpi32_ps(__r, __c);
2749 __r = _mm_movelh_ps(__r, __r);
2751 __r = _mm_cvtpi32_ps(__r, __c);
2752
2753 return __r;
2754}
2755
2756/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2757/// 128-bit vector of [4 x float].
2758///
2759/// \headerfile <x86intrin.h>
2760///
2761/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2762///
2763/// \param __a
2764/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2765/// destination are copied from the corresponding elements in this operand.
2766/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2767/// values from the operand.
2768static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2770{
2771 __m64 __b, __c;
2772 __m128 __r;
2773
2776 __r = _mm_setzero_ps();
2777 __r = _mm_cvtpi32_ps(__r, __c);
2778 __r = _mm_movelh_ps(__r, __r);
2780 __r = _mm_cvtpi32_ps(__r, __c);
2781
2782 return __r;
2783}
2784
2785/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2786/// into a 128-bit vector of [4 x float].
2787///
2788/// \headerfile <x86intrin.h>
2789///
2790/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2791///
2792/// \param __a
2793/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2794/// from the corresponding lower 4 elements in this operand.
2795/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2796/// values from the operand.
2797static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2799{
2800 __m64 __b;
2801
2805
2806 return _mm_cvtpi16_ps(__b);
2807}
2808
2809/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2810/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2811///
2812/// \headerfile <x86intrin.h>
2813///
2814/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2815///
2816/// \param __a
2817/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2818/// destination are copied from the corresponding lower 4 elements in this
2819/// operand.
2820/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2821/// values from the source operand.
2822static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2824{
2825 __m64 __b;
2826
2829
2830 return _mm_cvtpi16_ps(__b);
2831}
2832
2833/// Converts the two 32-bit signed integer values from each 64-bit vector
2834/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2835///
2836/// \headerfile <x86intrin.h>
2837///
2838/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2839///
2840/// \param __a
2841/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2842/// copied from the elements in this operand.
2843/// \param __b
2844/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2845/// copied from the elements in this operand.
2846/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2847/// copied and converted values from the first operand. The upper 64 bits
2848/// contain the copied and converted values from the second operand.
2849static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2851{
2852 __m128 __c;
2853
2854 __c = _mm_setzero_ps();
2857
2858 return _mm_cvtpi32_ps(__c, __a);
2859}
2860
2861/// Converts each single-precision floating-point element of a 128-bit
2862/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2863/// packs the results into a 64-bit integer vector of [4 x i16].
2864///
2865/// If the floating-point element is NaN or infinity, or if the
2866/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2867/// it is converted to 0x8000. Otherwise if the floating-point element is
2868/// greater than 0x7FFF, it is converted to 0x7FFF.
2869///
2870/// \headerfile <x86intrin.h>
2871///
2872/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2873///
2874/// \param __a
2875/// A 128-bit floating-point vector of [4 x float].
2876/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2877/// values.
2878static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2880{
2881 __m64 __b, __c;
2882
2886
2887 return _mm_packs_pi32(__b, __c);
2888}
2889
2890/// Converts each single-precision floating-point element of a 128-bit
2891/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2892/// packs the results into the lower 32 bits of a 64-bit integer vector of
2893/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2894///
2895/// If the floating-point element is NaN or infinity, or if the
2896/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2897/// is converted to 0x80. Otherwise if the floating-point element is greater
2898/// than 0x7F, it is converted to 0x7F.
2899///
2900/// \headerfile <x86intrin.h>
2901///
2902/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2903///
2904/// \param __a
2905/// 128-bit floating-point vector of [4 x float].
2906/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2907/// converted values and the uppper 32 bits are set to zero.
2908static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2910{
2911 __m64 __b, __c;
2912
2915
2916 return _mm_packs_pi16(__b, __c);
2917}
2918
2919/// Extracts the sign bits from each single-precision floating-point
2920/// element of a 128-bit floating-point vector of [4 x float] and returns the
2921/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2922/// to zero.
2923///
2924/// \headerfile <x86intrin.h>
2925///
2926/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2927///
2928/// \param __a
2929/// A 128-bit floating-point vector of [4 x float].
2930/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2931/// single-precision floating-point element of the parameter. Bits [31:4] are
2932/// set to zero.
2933static __inline__ int __DEFAULT_FN_ATTRS
2935{
2936 return __builtin_ia32_movmskps((__v4sf)__a);
2937}
2938
/* Compare */
/* Comparison predicates for the immediate operand of _mm_cmp_ps() and
   _mm_cmp_ss(); these are the eight values that fit in bits [2:0]. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */

/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    (Note that without avx enabled, only bits [2:0] are supported) \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))

/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    (Note that without avx enabled, only bits [2:0] are supported) \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))

/* Declaration attribute requesting 16-byte alignment. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit selectors into one 8-bit immediate: z lands in bits [7:6],
   y in bits [5:4], x in bits [3:2] and w in bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception-state flags (bits [5:0]). */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception-mask bits (bits [12:7]), one per exception flag above. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding-mode field (bits [14:13]). */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero bit (bit 15). */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)

/* Read one MXCSR field: fetch the register and keep only that field's bits. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Write one MXCSR field: clear that field's bits in the current register
   value, OR in x, and store the result back. Bits of x outside the field are
   not filtered out. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

/* Transposes, in place, the 4x4 float matrix whose rows are the four __m128
   lvalues row0..row3. Each row argument is read twice and written once.
   The temporaries use reserved (double-underscore) names so that a caller
   passing variables called tmp0..tmp3 is not shadowed by the macro's own
   locals. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)

/* Aliases for compatibility. */
/* Each _m_* name is a plain object-like macro that expands to the
   corresponding _mm_* name. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

3137#undef __DEFAULT_FN_ATTRS
3138#undef __DEFAULT_FN_ATTRS_MMX
3139
3140/* Ugly hack for backwards-compatibility (compatible with gcc) */
3141#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3142#include <emmintrin.h>
3143#endif
3144
3145#endif /* __XMMINTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
__INLINE unsigned char unsigned int __x
Definition: adxintrin.h:58
__INLINE unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:59
__INLINE unsigned char unsigned int unsigned int __y
Definition: adxintrin.h:58
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ void int __a
Definition: emmintrin.h:3986
struct __storeu_i16 *__P __v
Definition: immintrin.h:504
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:241
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:289
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1251
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1229
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2)
Converts 32-bit signed integers from both 64-bit integer vector parameters of [2 x i32] into 16-bit s...
Definition: mmintrin.h:161
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:312
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1286
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts 16-bit signed integers from both 64-bit integer vector parameters of [4 x i16] into 8-bit si...
Definition: mmintrin.h:131
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1093
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1397
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition: xmmintrin.h:253
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:549
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition: xmmintrin.h:218
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:200
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:884
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
Definition: xmmintrin.h:507
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:1907
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1362
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1800
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:408
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:530
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1324
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality and returns the ...
Definition: xmmintrin.h:489
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
Definition: xmmintrin.h:74
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition: xmmintrin.h:1727
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition: xmmintrin.h:2329
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for ineq...
Definition: xmmintrin.h:723
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1069
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1378
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition: xmmintrin.h:430
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
Definition: xmmintrin.h:2066
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition: xmmintrin.h:1780
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i32], truncating the result when it is inexact.
Definition: xmmintrin.h:1455
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
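Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than or equal to the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].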
Definition: xmmintrin.h:792
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
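Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than the second operand and returns the result of the comparison.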
Definition: xmmintrin.h:1189
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
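Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is less than or equal to the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].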
Definition: xmmintrin.h:573
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1517
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition: xmmintrin.h:2124
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2739
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
Definition: xmmintrin.h:306
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:2909
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:1945
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
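Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than or equal to the second operand and returns the result of the comparison.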
Definition: xmmintrin.h:1214
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than or equal to those in the second operand.
Definition: xmmintrin.h:681
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality and returns the result of the comparison.
Definition: xmmintrin.h:1020
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:2027
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2769
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1837
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition: xmmintrin.h:159
#define __DEFAULT_FN_ATTRS
Definition: xmmintrin.h:35
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
Definition: xmmintrin.h:390
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
Definition: xmmintrin.h:289
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
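Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand and returns the result of the comparison.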
Definition: xmmintrin.h:1264
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition: xmmintrin.h:2433
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is less than the second operand and returns the result of the comparison.
Definition: xmmintrin.h:1045
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
Definition: xmmintrin.h:1655
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:1987
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition: xmmintrin.h:96
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
Definition: xmmintrin.h:117
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
Definition: xmmintrin.h:1704
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2721
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
Definition: xmmintrin.h:348
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
Definition: xmmintrin.h:2143
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand and returns the result of the comparison.
Definition: xmmintrin.h:1117
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit integer.
Definition: xmmintrin.h:1306
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than those in the second operand.
Definition: xmmintrin.h:636
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition: xmmintrin.h:1892
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
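Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than the second operand and returns the result of the comparison.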
Definition: xmmintrin.h:1239
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition: xmmintrin.h:2634
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:1966
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
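Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not greater than the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].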
Definition: xmmintrin.h:837
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
Definition: xmmintrin.h:1628
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:467
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:270
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2678
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1818
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition: xmmintrin.h:2008
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:448
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:235
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality and returns the result of the comparison in the low-order bits of a vector of [4 x float].
Definition: xmmintrin.h:704
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
Definition: xmmintrin.h:1607
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2254
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:139
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition: xmmintrin.h:2273
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1494
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i32], truncating the result when it is inexact.
Definition: xmmintrin.h:1472
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit integer, truncating the result when it is inexact.
Definition: xmmintrin.h:1416
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition: xmmintrin.h:2934
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition: xmmintrin.h:2850
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:2700
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition: xmmintrin.h:1766
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
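Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is ordered with respect to the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].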
Definition: xmmintrin.h:931
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than those in the second operand.
Definition: xmmintrin.h:767
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:1924
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than those in the second operand.
Definition: xmmintrin.h:859
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than or equal to those in the second operand.
Definition: xmmintrin.h:906
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are ordered with respect to those in the second operand.
Definition: xmmintrin.h:951
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
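Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].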
Definition: xmmintrin.h:615
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1590
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine equality and returns the result of the comparison.
Definition: xmmintrin.h:1165
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:54
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition: xmmintrin.h:1864
static __inline__ int __DEFAULT_FN_ATTRS_MMX _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
Definition: xmmintrin.h:2310
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
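Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].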
Definition: xmmintrin.h:747
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:2047
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition: xmmintrin.h:329
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
Definition: xmmintrin.h:2823
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are less than or equal to those in the second operand.
Definition: xmmintrin.h:592
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
#define __DEFAULT_FN_ATTRS_MMX
Definition: xmmintrin.h:36
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition: xmmintrin.h:2414
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
Definition: xmmintrin.h:2798
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition: xmmintrin.h:2656
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
Definition: xmmintrin.h:371
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition: xmmintrin.h:2395
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition: xmmintrin.h:2235
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine inequality and returns the result of the comparison.
Definition: xmmintrin.h:1288
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:2879
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is not equal to the second operand and returns the result of the comparison.
Definition: xmmintrin.h:1141
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition: xmmintrin.h:2292
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition: xmmintrin.h:812
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition: xmmintrin.h:2455
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1567
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
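Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is unordered with respect to the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].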
Definition: xmmintrin.h:976
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
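Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than or equal to the corresponding value in the second operand and returns the result of the comparison in the low-order bits of a vector of [4 x float].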
Definition: xmmintrin.h:660
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are unordered with respect to those in the second operand.
Definition: xmmintrin.h:996
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1744
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1682
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...
Definition: xmmintrin.h:181