clang 19.0.0git
xmmintrin.h
Go to the documentation of this file.
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <mmintrin.h>
18
/* Element-typed 128-bit vector types (four 32-bit lanes each). */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* 128-bit vector of four floats with 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same element layout as __m128, but declared with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
27
28/* This header should only be included in a hosted environment as it depends on
29 * a standard library to provide allocation routines. */
30#if __STDC_HOSTED__
31#include <mm_malloc.h>
32#endif
33
/* Default attribute sets for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS      always-inline, no debug info, target features
 *                         "sse,no-evex512", minimum vector width 128.
 * __DEFAULT_FN_ATTRS_MMX  same, but with target features
 *                         "mmx,sse,no-evex512" and minimum vector width 64.
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
41
42/// Adds the 32-bit float values in the low-order bits of the operands.
43///
44/// \headerfile <x86intrin.h>
45///
46/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
47///
48/// \param __a
49/// A 128-bit vector of [4 x float] containing one of the source operands.
50/// The lower 32 bits of this operand are used in the calculation.
51/// \param __b
52/// A 128-bit vector of [4 x float] containing one of the source operands.
53/// The lower 32 bits of this operand are used in the calculation.
54/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
55/// of the lower 32 bits of both operands. The upper 96 bits are copied from
56/// the upper 96 bits of the first source operand.
57static __inline__ __m128 __DEFAULT_FN_ATTRS
58_mm_add_ss(__m128 __a, __m128 __b)
59{
60 __a[0] += __b[0];
61 return __a;
62}
63
64/// Adds two 128-bit vectors of [4 x float], and returns the results of
65/// the addition.
66///
67/// \headerfile <x86intrin.h>
68///
69/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
70///
71/// \param __a
72/// A 128-bit vector of [4 x float] containing one of the source operands.
73/// \param __b
74/// A 128-bit vector of [4 x float] containing one of the source operands.
75/// \returns A 128-bit vector of [4 x float] containing the sums of both
76/// operands.
77static __inline__ __m128 __DEFAULT_FN_ATTRS
78_mm_add_ps(__m128 __a, __m128 __b)
79{
80 return (__m128)((__v4sf)__a + (__v4sf)__b);
81}
82
83/// Subtracts the 32-bit float value in the low-order bits of the second
84/// operand from the corresponding value in the first operand.
85///
86/// \headerfile <x86intrin.h>
87///
88/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
89///
90/// \param __a
91/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
92/// of this operand are used in the calculation.
93/// \param __b
94/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
95/// bits of this operand are used in the calculation.
96/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
97/// difference of the lower 32 bits of both operands. The upper 96 bits are
98/// copied from the upper 96 bits of the first source operand.
99static __inline__ __m128 __DEFAULT_FN_ATTRS
100_mm_sub_ss(__m128 __a, __m128 __b)
101{
102 __a[0] -= __b[0];
103 return __a;
104}
105
106/// Subtracts each of the values of the second operand from the first
107/// operand, both of which are 128-bit vectors of [4 x float] and returns
108/// the results of the subtraction.
109///
110/// \headerfile <x86intrin.h>
111///
112/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
113///
114/// \param __a
115/// A 128-bit vector of [4 x float] containing the minuend.
116/// \param __b
117/// A 128-bit vector of [4 x float] containing the subtrahend.
118/// \returns A 128-bit vector of [4 x float] containing the differences between
119/// both operands.
120static __inline__ __m128 __DEFAULT_FN_ATTRS
121_mm_sub_ps(__m128 __a, __m128 __b)
122{
123 return (__m128)((__v4sf)__a - (__v4sf)__b);
124}
125
126/// Multiplies two 32-bit float values in the low-order bits of the
127/// operands.
128///
129/// \headerfile <x86intrin.h>
130///
131/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
132///
133/// \param __a
134/// A 128-bit vector of [4 x float] containing one of the source operands.
135/// The lower 32 bits of this operand are used in the calculation.
136/// \param __b
137/// A 128-bit vector of [4 x float] containing one of the source operands.
138/// The lower 32 bits of this operand are used in the calculation.
139/// \returns A 128-bit vector of [4 x float] containing the product of the lower
140/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
141/// bits of the first source operand.
142static __inline__ __m128 __DEFAULT_FN_ATTRS
143_mm_mul_ss(__m128 __a, __m128 __b)
144{
145 __a[0] *= __b[0];
146 return __a;
147}
148
149/// Multiplies two 128-bit vectors of [4 x float] and returns the
150/// results of the multiplication.
151///
152/// \headerfile <x86intrin.h>
153///
154/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
155///
156/// \param __a
157/// A 128-bit vector of [4 x float] containing one of the source operands.
158/// \param __b
159/// A 128-bit vector of [4 x float] containing one of the source operands.
160/// \returns A 128-bit vector of [4 x float] containing the products of both
161/// operands.
162static __inline__ __m128 __DEFAULT_FN_ATTRS
163_mm_mul_ps(__m128 __a, __m128 __b)
164{
165 return (__m128)((__v4sf)__a * (__v4sf)__b);
166}
167
168/// Divides the value in the low-order 32 bits of the first operand by
169/// the corresponding value in the second operand.
170///
171/// \headerfile <x86intrin.h>
172///
173/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
174///
175/// \param __a
176/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
177/// bits of this operand are used in the calculation.
178/// \param __b
179/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
180/// of this operand are used in the calculation.
181/// \returns A 128-bit vector of [4 x float] containing the quotients of the
182/// lower 32 bits of both operands. The upper 96 bits are copied from the
183/// upper 96 bits of the first source operand.
184static __inline__ __m128 __DEFAULT_FN_ATTRS
185_mm_div_ss(__m128 __a, __m128 __b)
186{
187 __a[0] /= __b[0];
188 return __a;
189}
190
191/// Divides two 128-bit vectors of [4 x float].
192///
193/// \headerfile <x86intrin.h>
194///
195/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
196///
197/// \param __a
198/// A 128-bit vector of [4 x float] containing the dividend.
199/// \param __b
200/// A 128-bit vector of [4 x float] containing the divisor.
201/// \returns A 128-bit vector of [4 x float] containing the quotients of both
202/// operands.
203static __inline__ __m128 __DEFAULT_FN_ATTRS
204_mm_div_ps(__m128 __a, __m128 __b)
205{
206 return (__m128)((__v4sf)__a / (__v4sf)__b);
207}
208
209/// Calculates the square root of the value stored in the low-order bits
210/// of a 128-bit vector of [4 x float].
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
215///
216/// \param __a
217/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
218/// used in the calculation.
219/// \returns A 128-bit vector of [4 x float] containing the square root of the
220/// value in the low-order bits of the operand.
221static __inline__ __m128 __DEFAULT_FN_ATTRS
223{
224 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
225}
226
227/// Calculates the square roots of the values stored in a 128-bit vector
228/// of [4 x float].
229///
230/// \headerfile <x86intrin.h>
231///
232/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
233///
234/// \param __a
235/// A 128-bit vector of [4 x float].
236/// \returns A 128-bit vector of [4 x float] containing the square roots of the
237/// values in the operand.
238static __inline__ __m128 __DEFAULT_FN_ATTRS
240{
241 return __builtin_ia32_sqrtps((__v4sf)__a);
242}
243
244/// Calculates the approximate reciprocal of the value stored in the
245/// low-order bits of a 128-bit vector of [4 x float].
246///
247/// \headerfile <x86intrin.h>
248///
249/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
250///
251/// \param __a
252/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
253/// used in the calculation.
254/// \returns A 128-bit vector of [4 x float] containing the approximate
255/// reciprocal of the value in the low-order bits of the operand.
256static __inline__ __m128 __DEFAULT_FN_ATTRS
258{
259 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
260}
261
262/// Calculates the approximate reciprocals of the values stored in a
263/// 128-bit vector of [4 x float].
264///
265/// \headerfile <x86intrin.h>
266///
267/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
268///
269/// \param __a
270/// A 128-bit vector of [4 x float].
271/// \returns A 128-bit vector of [4 x float] containing the approximate
272/// reciprocals of the values in the operand.
273static __inline__ __m128 __DEFAULT_FN_ATTRS
275{
276 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
277}
278
279/// Calculates the approximate reciprocal of the square root of the value
280/// stored in the low-order bits of a 128-bit vector of [4 x float].
281///
282/// \headerfile <x86intrin.h>
283///
284/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
285///
286/// \param __a
287/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
288/// used in the calculation.
289/// \returns A 128-bit vector of [4 x float] containing the approximate
290/// reciprocal of the square root of the value in the low-order bits of the
291/// operand.
292static __inline__ __m128 __DEFAULT_FN_ATTRS
294{
295 return __builtin_ia32_rsqrtss((__v4sf)__a);
296}
297
298/// Calculates the approximate reciprocals of the square roots of the
299/// values stored in a 128-bit vector of [4 x float].
300///
301/// \headerfile <x86intrin.h>
302///
303/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
304///
305/// \param __a
306/// A 128-bit vector of [4 x float].
307/// \returns A 128-bit vector of [4 x float] containing the approximate
308/// reciprocals of the square roots of the values in the operand.
309static __inline__ __m128 __DEFAULT_FN_ATTRS
311{
312 return __builtin_ia32_rsqrtps((__v4sf)__a);
313}
314
315/// Compares two 32-bit float values in the low-order bits of both
316/// operands and returns the lesser value in the low-order bits of the
317/// vector of [4 x float].
318///
319/// If either value in a comparison is NaN, returns the value from \a __b.
320///
321/// \headerfile <x86intrin.h>
322///
323/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
324///
325/// \param __a
326/// A 128-bit vector of [4 x float] containing one of the operands. The lower
327/// 32 bits of this operand are used in the comparison.
328/// \param __b
329/// A 128-bit vector of [4 x float] containing one of the operands. The lower
330/// 32 bits of this operand are used in the comparison.
331/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
332/// minimum value between both operands. The upper 96 bits are copied from
333/// the upper 96 bits of the first source operand.
334static __inline__ __m128 __DEFAULT_FN_ATTRS
335_mm_min_ss(__m128 __a, __m128 __b)
336{
337 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
338}
339
340/// Compares two 128-bit vectors of [4 x float] and returns the lesser
341/// of each pair of values.
342///
343/// If either value in a comparison is NaN, returns the value from \a __b.
344///
345/// \headerfile <x86intrin.h>
346///
347/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
348///
349/// \param __a
350/// A 128-bit vector of [4 x float] containing one of the operands.
351/// \param __b
352/// A 128-bit vector of [4 x float] containing one of the operands.
353/// \returns A 128-bit vector of [4 x float] containing the minimum values
354/// between both operands.
355static __inline__ __m128 __DEFAULT_FN_ATTRS
356_mm_min_ps(__m128 __a, __m128 __b)
357{
358 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
359}
360
361/// Compares two 32-bit float values in the low-order bits of both
362/// operands and returns the greater value in the low-order bits of a 128-bit
363/// vector of [4 x float].
364///
365/// If either value in a comparison is NaN, returns the value from \a __b.
366///
367/// \headerfile <x86intrin.h>
368///
369/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
370///
371/// \param __a
372/// A 128-bit vector of [4 x float] containing one of the operands. The lower
373/// 32 bits of this operand are used in the comparison.
374/// \param __b
375/// A 128-bit vector of [4 x float] containing one of the operands. The lower
376/// 32 bits of this operand are used in the comparison.
377/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378/// maximum value between both operands. The upper 96 bits are copied from
379/// the upper 96 bits of the first source operand.
380static __inline__ __m128 __DEFAULT_FN_ATTRS
381_mm_max_ss(__m128 __a, __m128 __b)
382{
383 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
384}
385
386/// Compares two 128-bit vectors of [4 x float] and returns the greater
387/// of each pair of values.
388///
389/// If either value in a comparison is NaN, returns the value from \a __b.
390///
391/// \headerfile <x86intrin.h>
392///
393/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
394///
395/// \param __a
396/// A 128-bit vector of [4 x float] containing one of the operands.
397/// \param __b
398/// A 128-bit vector of [4 x float] containing one of the operands.
399/// \returns A 128-bit vector of [4 x float] containing the maximum values
400/// between both operands.
401static __inline__ __m128 __DEFAULT_FN_ATTRS
402_mm_max_ps(__m128 __a, __m128 __b)
403{
404 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
405}
406
407/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
408///
409/// \headerfile <x86intrin.h>
410///
411/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
412///
413/// \param __a
414/// A 128-bit vector containing one of the source operands.
415/// \param __b
416/// A 128-bit vector containing one of the source operands.
417/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
418/// values between both operands.
419static __inline__ __m128 __DEFAULT_FN_ATTRS
420_mm_and_ps(__m128 __a, __m128 __b)
421{
422 return (__m128)((__v4su)__a & (__v4su)__b);
423}
424
425/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
426/// the one's complement of the values contained in the first source
427/// operand.
428///
429/// \headerfile <x86intrin.h>
430///
431/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
432///
433/// \param __a
434/// A 128-bit vector of [4 x float] containing the first source operand. The
435/// one's complement of this value is used in the bitwise AND.
436/// \param __b
437/// A 128-bit vector of [4 x float] containing the second source operand.
438/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439/// one's complement of the first operand and the values in the second
440/// operand.
441static __inline__ __m128 __DEFAULT_FN_ATTRS
442_mm_andnot_ps(__m128 __a, __m128 __b)
443{
444 return (__m128)(~(__v4su)__a & (__v4su)__b);
445}
446
447/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
448///
449/// \headerfile <x86intrin.h>
450///
451/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
452///
453/// \param __a
454/// A 128-bit vector of [4 x float] containing one of the source operands.
455/// \param __b
456/// A 128-bit vector of [4 x float] containing one of the source operands.
457/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
458/// values between both operands.
459static __inline__ __m128 __DEFAULT_FN_ATTRS
460_mm_or_ps(__m128 __a, __m128 __b)
461{
462 return (__m128)((__v4su)__a | (__v4su)__b);
463}
464
465/// Performs a bitwise exclusive OR of two 128-bit vectors of
466/// [4 x float].
467///
468/// \headerfile <x86intrin.h>
469///
470/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
471///
472/// \param __a
473/// A 128-bit vector of [4 x float] containing one of the source operands.
474/// \param __b
475/// A 128-bit vector of [4 x float] containing one of the source operands.
476/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
477/// of the values between both operands.
478static __inline__ __m128 __DEFAULT_FN_ATTRS
479_mm_xor_ps(__m128 __a, __m128 __b)
480{
481 return (__m128)((__v4su)__a ^ (__v4su)__b);
482}
483
484/// Compares two 32-bit float values in the low-order bits of both
485/// operands for equality.
486///
487/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
488/// low-order bits of a vector [4 x float].
489/// If either value in a comparison is NaN, returns false.
490///
491/// \headerfile <x86intrin.h>
492///
493/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
494///
495/// \param __a
496/// A 128-bit vector of [4 x float] containing one of the operands. The lower
497/// 32 bits of this operand are used in the comparison.
498/// \param __b
499/// A 128-bit vector of [4 x float] containing one of the operands. The lower
500/// 32 bits of this operand are used in the comparison.
501/// \returns A 128-bit vector of [4 x float] containing the comparison results
502/// in the low-order bits.
503static __inline__ __m128 __DEFAULT_FN_ATTRS
504_mm_cmpeq_ss(__m128 __a, __m128 __b)
505{
506 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
507}
508
509/// Compares each of the corresponding 32-bit float values of the
510/// 128-bit vectors of [4 x float] for equality.
511///
512/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
513/// If either value in a comparison is NaN, returns false.
514///
515/// \headerfile <x86intrin.h>
516///
517/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
518///
519/// \param __a
520/// A 128-bit vector of [4 x float].
521/// \param __b
522/// A 128-bit vector of [4 x float].
523/// \returns A 128-bit vector of [4 x float] containing the comparison results.
524static __inline__ __m128 __DEFAULT_FN_ATTRS
525_mm_cmpeq_ps(__m128 __a, __m128 __b)
526{
527 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
528}
529
530/// Compares two 32-bit float values in the low-order bits of both
531/// operands to determine if the value in the first operand is less than the
532/// corresponding value in the second operand.
533///
534/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
535/// low-order bits of a vector of [4 x float].
536/// If either value in a comparison is NaN, returns false.
537///
538/// \headerfile <x86intrin.h>
539///
540/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
541///
542/// \param __a
543/// A 128-bit vector of [4 x float] containing one of the operands. The lower
544/// 32 bits of this operand are used in the comparison.
545/// \param __b
546/// A 128-bit vector of [4 x float] containing one of the operands. The lower
547/// 32 bits of this operand are used in the comparison.
548/// \returns A 128-bit vector of [4 x float] containing the comparison results
549/// in the low-order bits.
550static __inline__ __m128 __DEFAULT_FN_ATTRS
551_mm_cmplt_ss(__m128 __a, __m128 __b)
552{
553 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
554}
555
556/// Compares each of the corresponding 32-bit float values of the
557/// 128-bit vectors of [4 x float] to determine if the values in the first
558/// operand are less than those in the second operand.
559///
560/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
561/// If either value in a comparison is NaN, returns false.
562///
563/// \headerfile <x86intrin.h>
564///
565/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
566///
567/// \param __a
568/// A 128-bit vector of [4 x float].
569/// \param __b
570/// A 128-bit vector of [4 x float].
571/// \returns A 128-bit vector of [4 x float] containing the comparison results.
572static __inline__ __m128 __DEFAULT_FN_ATTRS
573_mm_cmplt_ps(__m128 __a, __m128 __b)
574{
575 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
576}
577
578/// Compares two 32-bit float values in the low-order bits of both
579/// operands to determine if the value in the first operand is less than or
580/// equal to the corresponding value in the second operand.
581///
582/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
583/// the low-order bits of a vector of [4 x float].
584/// If either value in a comparison is NaN, returns false.
585///
586/// \headerfile <x86intrin.h>
587///
588/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
589///
590/// \param __a
591/// A 128-bit vector of [4 x float] containing one of the operands. The lower
592/// 32 bits of this operand are used in the comparison.
593/// \param __b
594/// A 128-bit vector of [4 x float] containing one of the operands. The lower
595/// 32 bits of this operand are used in the comparison.
596/// \returns A 128-bit vector of [4 x float] containing the comparison results
597/// in the low-order bits.
598static __inline__ __m128 __DEFAULT_FN_ATTRS
599_mm_cmple_ss(__m128 __a, __m128 __b)
600{
601 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
602}
603
604/// Compares each of the corresponding 32-bit float values of the
605/// 128-bit vectors of [4 x float] to determine if the values in the first
606/// operand are less than or equal to those in the second operand.
607///
608/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
609/// If either value in a comparison is NaN, returns false.
610///
611/// \headerfile <x86intrin.h>
612///
613/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
614///
615/// \param __a
616/// A 128-bit vector of [4 x float].
617/// \param __b
618/// A 128-bit vector of [4 x float].
619/// \returns A 128-bit vector of [4 x float] containing the comparison results.
620static __inline__ __m128 __DEFAULT_FN_ATTRS
621_mm_cmple_ps(__m128 __a, __m128 __b)
622{
623 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
624}
625
626/// Compares two 32-bit float values in the low-order bits of both
627/// operands to determine if the value in the first operand is greater than
628/// the corresponding value in the second operand.
629///
630/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
631/// low-order bits of a vector of [4 x float].
632/// If either value in a comparison is NaN, returns false.
633///
634/// \headerfile <x86intrin.h>
635///
636/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
637///
638/// \param __a
639/// A 128-bit vector of [4 x float] containing one of the operands. The lower
640/// 32 bits of this operand are used in the comparison.
641/// \param __b
642/// A 128-bit vector of [4 x float] containing one of the operands. The lower
643/// 32 bits of this operand are used in the comparison.
644/// \returns A 128-bit vector of [4 x float] containing the comparison results
645/// in the low-order bits.
646static __inline__ __m128 __DEFAULT_FN_ATTRS
647_mm_cmpgt_ss(__m128 __a, __m128 __b)
648{
649 return (__m128)__builtin_shufflevector((__v4sf)__a,
650 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
651 4, 1, 2, 3);
652}
653
654/// Compares each of the corresponding 32-bit float values of the
655/// 128-bit vectors of [4 x float] to determine if the values in the first
656/// operand are greater than those in the second operand.
657///
658/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
659/// If either value in a comparison is NaN, returns false.
660///
661/// \headerfile <x86intrin.h>
662///
663/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
664///
665/// \param __a
666/// A 128-bit vector of [4 x float].
667/// \param __b
668/// A 128-bit vector of [4 x float].
669/// \returns A 128-bit vector of [4 x float] containing the comparison results.
670static __inline__ __m128 __DEFAULT_FN_ATTRS
671_mm_cmpgt_ps(__m128 __a, __m128 __b)
672{
673 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
674}
675
676/// Compares two 32-bit float values in the low-order bits of both
677/// operands to determine if the value in the first operand is greater than
678/// or equal to the corresponding value in the second operand.
679///
680/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
681/// low-order bits of a vector of [4 x float].
682/// If either value in a comparison is NaN, returns false.
683///
684/// \headerfile <x86intrin.h>
685///
686/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
687///
688/// \param __a
689/// A 128-bit vector of [4 x float] containing one of the operands. The lower
690/// 32 bits of this operand are used in the comparison.
691/// \param __b
692/// A 128-bit vector of [4 x float] containing one of the operands. The lower
693/// 32 bits of this operand are used in the comparison.
694/// \returns A 128-bit vector of [4 x float] containing the comparison results
695/// in the low-order bits.
696static __inline__ __m128 __DEFAULT_FN_ATTRS
697_mm_cmpge_ss(__m128 __a, __m128 __b)
698{
699 return (__m128)__builtin_shufflevector((__v4sf)__a,
700 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
701 4, 1, 2, 3);
702}
703
704/// Compares each of the corresponding 32-bit float values of the
705/// 128-bit vectors of [4 x float] to determine if the values in the first
706/// operand are greater than or equal to those in the second operand.
707///
708/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
709/// If either value in a comparison is NaN, returns false.
710///
711/// \headerfile <x86intrin.h>
712///
713/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
714///
715/// \param __a
716/// A 128-bit vector of [4 x float].
717/// \param __b
718/// A 128-bit vector of [4 x float].
719/// \returns A 128-bit vector of [4 x float] containing the comparison results.
720static __inline__ __m128 __DEFAULT_FN_ATTRS
721_mm_cmpge_ps(__m128 __a, __m128 __b)
722{
723 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
724}
725
726/// Compares two 32-bit float values in the low-order bits of both operands
727/// for inequality.
728///
729/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
730/// low-order bits of a vector of [4 x float].
731/// If either value in a comparison is NaN, returns true.
732///
733/// \headerfile <x86intrin.h>
734///
735/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
736/// instructions.
737///
738/// \param __a
739/// A 128-bit vector of [4 x float] containing one of the operands. The lower
740/// 32 bits of this operand are used in the comparison.
741/// \param __b
742/// A 128-bit vector of [4 x float] containing one of the operands. The lower
743/// 32 bits of this operand are used in the comparison.
744/// \returns A 128-bit vector of [4 x float] containing the comparison results
745/// in the low-order bits.
746static __inline__ __m128 __DEFAULT_FN_ATTRS
747_mm_cmpneq_ss(__m128 __a, __m128 __b)
748{
749 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
750}
751
752/// Compares each of the corresponding 32-bit float values of the
753/// 128-bit vectors of [4 x float] for inequality.
754///
755/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
756/// If either value in a comparison is NaN, returns true.
757///
758/// \headerfile <x86intrin.h>
759///
760/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
761/// instructions.
762///
763/// \param __a
764/// A 128-bit vector of [4 x float].
765/// \param __b
766/// A 128-bit vector of [4 x float].
767/// \returns A 128-bit vector of [4 x float] containing the comparison results.
768static __inline__ __m128 __DEFAULT_FN_ATTRS
769_mm_cmpneq_ps(__m128 __a, __m128 __b)
770{
771 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
772}
773
774/// Compares two 32-bit float values in the low-order bits of both
775/// operands to determine if the value in the first operand is not less than
776/// the corresponding value in the second operand.
777///
778/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
779/// low-order bits of a vector of [4 x float].
780/// If either value in a comparison is NaN, returns true.
781///
782/// \headerfile <x86intrin.h>
783///
784/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
785/// instructions.
786///
787/// \param __a
788/// A 128-bit vector of [4 x float] containing one of the operands. The lower
789/// 32 bits of this operand are used in the comparison.
790/// \param __b
791/// A 128-bit vector of [4 x float] containing one of the operands. The lower
792/// 32 bits of this operand are used in the comparison.
793/// \returns A 128-bit vector of [4 x float] containing the comparison results
794/// in the low-order bits.
795static __inline__ __m128 __DEFAULT_FN_ATTRS
796_mm_cmpnlt_ss(__m128 __a, __m128 __b)
797{
798 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
799}
800
801/// Compares each of the corresponding 32-bit float values of the
802/// 128-bit vectors of [4 x float] to determine if the values in the first
803/// operand are not less than those in the second operand.
804///
805/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
806/// If either value in a comparison is NaN, returns true.
807///
808/// \headerfile <x86intrin.h>
809///
810/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
811/// instructions.
812///
813/// \param __a
814/// A 128-bit vector of [4 x float].
815/// \param __b
816/// A 128-bit vector of [4 x float].
817/// \returns A 128-bit vector of [4 x float] containing the comparison results.
818static __inline__ __m128 __DEFAULT_FN_ATTRS
819_mm_cmpnlt_ps(__m128 __a, __m128 __b)
820{
821 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
822}
823
824/// Compares two 32-bit float values in the low-order bits of both
825/// operands to determine if the value in the first operand is not less than
826/// or equal to the corresponding value in the second operand.
827///
828/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
829/// low-order bits of a vector of [4 x float].
830/// If either value in a comparison is NaN, returns true.
831///
832/// \headerfile <x86intrin.h>
833///
834/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
835/// instructions.
836///
837/// \param __a
838/// A 128-bit vector of [4 x float] containing one of the operands. The lower
839/// 32 bits of this operand are used in the comparison.
840/// \param __b
841/// A 128-bit vector of [4 x float] containing one of the operands. The lower
842/// 32 bits of this operand are used in the comparison.
843/// \returns A 128-bit vector of [4 x float] containing the comparison results
844/// in the low-order bits.
845static __inline__ __m128 __DEFAULT_FN_ATTRS
846_mm_cmpnle_ss(__m128 __a, __m128 __b)
847{
848 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
849}
850
851/// Compares each of the corresponding 32-bit float values of the
852/// 128-bit vectors of [4 x float] to determine if the values in the first
853/// operand are not less than or equal to those in the second operand.
854///
855/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
856/// If either value in a comparison is NaN, returns true.
857///
858/// \headerfile <x86intrin.h>
859///
860/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
861/// instructions.
862///
863/// \param __a
864/// A 128-bit vector of [4 x float].
865/// \param __b
866/// A 128-bit vector of [4 x float].
867/// \returns A 128-bit vector of [4 x float] containing the comparison results.
868static __inline__ __m128 __DEFAULT_FN_ATTRS
869_mm_cmpnle_ps(__m128 __a, __m128 __b)
870{
871 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
872}
873
874/// Compares two 32-bit float values in the low-order bits of both
875/// operands to determine if the value in the first operand is not greater
876/// than the corresponding value in the second operand.
877///
878/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
879/// low-order bits of a vector of [4 x float].
880/// If either value in a comparison is NaN, returns true.
881///
882/// \headerfile <x86intrin.h>
883///
884/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
885/// instructions.
886///
887/// \param __a
888/// A 128-bit vector of [4 x float] containing one of the operands. The lower
889/// 32 bits of this operand are used in the comparison.
890/// \param __b
891/// A 128-bit vector of [4 x float] containing one of the operands. The lower
892/// 32 bits of this operand are used in the comparison.
893/// \returns A 128-bit vector of [4 x float] containing the comparison results
894/// in the low-order bits.
895static __inline__ __m128 __DEFAULT_FN_ATTRS
896_mm_cmpngt_ss(__m128 __a, __m128 __b)
897{
898 return (__m128)__builtin_shufflevector((__v4sf)__a,
899 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
900 4, 1, 2, 3);
901}
902
903/// Compares each of the corresponding 32-bit float values of the
904/// 128-bit vectors of [4 x float] to determine if the values in the first
905/// operand are not greater than those in the second operand.
906///
907/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
908/// If either value in a comparison is NaN, returns true.
909///
910/// \headerfile <x86intrin.h>
911///
912/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
913/// instructions.
914///
915/// \param __a
916/// A 128-bit vector of [4 x float].
917/// \param __b
918/// A 128-bit vector of [4 x float].
919/// \returns A 128-bit vector of [4 x float] containing the comparison results.
920static __inline__ __m128 __DEFAULT_FN_ATTRS
921_mm_cmpngt_ps(__m128 __a, __m128 __b)
922{
923 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
924}
925
926/// Compares two 32-bit float values in the low-order bits of both
927/// operands to determine if the value in the first operand is not greater
928/// than or equal to the corresponding value in the second operand.
929///
930/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
931/// low-order bits of a vector of [4 x float].
932/// If either value in a comparison is NaN, returns true.
933///
934/// \headerfile <x86intrin.h>
935///
936/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
937/// instructions.
938///
939/// \param __a
940/// A 128-bit vector of [4 x float] containing one of the operands. The lower
941/// 32 bits of this operand are used in the comparison.
942/// \param __b
943/// A 128-bit vector of [4 x float] containing one of the operands. The lower
944/// 32 bits of this operand are used in the comparison.
945/// \returns A 128-bit vector of [4 x float] containing the comparison results
946/// in the low-order bits.
947static __inline__ __m128 __DEFAULT_FN_ATTRS
948_mm_cmpnge_ss(__m128 __a, __m128 __b)
949{
950 return (__m128)__builtin_shufflevector((__v4sf)__a,
951 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
952 4, 1, 2, 3);
953}
954
955/// Compares each of the corresponding 32-bit float values of the
956/// 128-bit vectors of [4 x float] to determine if the values in the first
957/// operand are not greater than or equal to those in the second operand.
958///
959/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
960/// If either value in a comparison is NaN, returns true.
961///
962/// \headerfile <x86intrin.h>
963///
964/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
965/// instructions.
966///
967/// \param __a
968/// A 128-bit vector of [4 x float].
969/// \param __b
970/// A 128-bit vector of [4 x float].
971/// \returns A 128-bit vector of [4 x float] containing the comparison results.
972static __inline__ __m128 __DEFAULT_FN_ATTRS
973_mm_cmpnge_ps(__m128 __a, __m128 __b)
974{
975 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
976}
977
978/// Compares two 32-bit float values in the low-order bits of both
979/// operands to determine if the value in the first operand is ordered with
980/// respect to the corresponding value in the second operand.
981///
982/// A pair of floating-point values are ordered with respect to each
983/// other if neither value is a NaN. Each comparison returns 0x0 for false,
984/// 0xFFFFFFFF for true.
985///
986/// \headerfile <x86intrin.h>
987///
988/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
989/// instructions.
990///
991/// \param __a
992/// A 128-bit vector of [4 x float] containing one of the operands. The lower
993/// 32 bits of this operand are used in the comparison.
994/// \param __b
995/// A 128-bit vector of [4 x float] containing one of the operands. The lower
996/// 32 bits of this operand are used in the comparison.
997/// \returns A 128-bit vector of [4 x float] containing the comparison results
998/// in the low-order bits.
999static __inline__ __m128 __DEFAULT_FN_ATTRS
1000_mm_cmpord_ss(__m128 __a, __m128 __b)
1001{
1002 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1003}
1004
1005/// Compares each of the corresponding 32-bit float values of the
1006/// 128-bit vectors of [4 x float] to determine if the values in the first
1007/// operand are ordered with respect to those in the second operand.
1008///
1009/// A pair of floating-point values are ordered with respect to each
1010/// other if neither value is a NaN. Each comparison returns 0x0 for false,
1011/// 0xFFFFFFFF for true.
1012///
1013/// \headerfile <x86intrin.h>
1014///
1015/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1016/// instructions.
1017///
1018/// \param __a
1019/// A 128-bit vector of [4 x float].
1020/// \param __b
1021/// A 128-bit vector of [4 x float].
1022/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1023static __inline__ __m128 __DEFAULT_FN_ATTRS
1024_mm_cmpord_ps(__m128 __a, __m128 __b)
1025{
1026 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1027}
1028
1029/// Compares two 32-bit float values in the low-order bits of both
1030/// operands to determine if the value in the first operand is unordered
1031/// with respect to the corresponding value in the second operand.
1032///
1033/// A pair of double-precision values are unordered with respect to each
1034/// other if one or both values are NaN. Each comparison returns 0x0 for
1035/// false, 0xFFFFFFFF for true.
1036///
1037/// \headerfile <x86intrin.h>
1038///
1039/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1040/// instructions.
1041///
1042/// \param __a
1043/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1044/// 32 bits of this operand are used in the comparison.
1045/// \param __b
1046/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1047/// 32 bits of this operand are used in the comparison.
1048/// \returns A 128-bit vector of [4 x float] containing the comparison results
1049/// in the low-order bits.
1050static __inline__ __m128 __DEFAULT_FN_ATTRS
1051_mm_cmpunord_ss(__m128 __a, __m128 __b)
1052{
1053 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1054}
1055
1056/// Compares each of the corresponding 32-bit float values of the
1057/// 128-bit vectors of [4 x float] to determine if the values in the first
1058/// operand are unordered with respect to those in the second operand.
1059///
1060/// A pair of double-precision values are unordered with respect to each
1061/// other if one or both values are NaN. Each comparison returns 0x0 for
1062/// false, 0xFFFFFFFFFFFFFFFF for true.
1063///
1064/// \headerfile <x86intrin.h>
1065///
1066/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1067/// instructions.
1068///
1069/// \param __a
1070/// A 128-bit vector of [4 x float].
1071/// \param __b
1072/// A 128-bit vector of [4 x float].
1073/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1074static __inline__ __m128 __DEFAULT_FN_ATTRS
1075_mm_cmpunord_ps(__m128 __a, __m128 __b)
1076{
1077 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1078}
1079
1080/// Compares two 32-bit float values in the low-order bits of both
1081/// operands for equality.
1082///
1083/// The comparison returns 0 for false, 1 for true. If either value in a
1084/// comparison is NaN, returns 0.
1085///
1086/// \headerfile <x86intrin.h>
1087///
1088/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1089/// instructions.
1090///
1091/// \param __a
1092/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093/// used in the comparison.
1094/// \param __b
1095/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1096/// used in the comparison.
1097/// \returns An integer containing the comparison results.
1098static __inline__ int __DEFAULT_FN_ATTRS
1099_mm_comieq_ss(__m128 __a, __m128 __b)
1100{
1101 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1102}
1103
1104/// Compares two 32-bit float values in the low-order bits of both
1105/// operands to determine if the first operand is less than the second
1106/// operand.
1107///
1108/// The comparison returns 0 for false, 1 for true. If either value in a
1109/// comparison is NaN, returns 0.
1110///
1111/// \headerfile <x86intrin.h>
1112///
1113/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1114/// instructions.
1115///
1116/// \param __a
1117/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1118/// used in the comparison.
1119/// \param __b
1120/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1121/// used in the comparison.
1122/// \returns An integer containing the comparison results.
1123static __inline__ int __DEFAULT_FN_ATTRS
1124_mm_comilt_ss(__m128 __a, __m128 __b)
1125{
1126 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1127}
1128
1129/// Compares two 32-bit float values in the low-order bits of both
1130/// operands to determine if the first operand is less than or equal to the
1131/// second operand.
1132///
1133/// The comparison returns 0 for false, 1 for true. If either value in a
1134/// comparison is NaN, returns 0.
1135///
1136/// \headerfile <x86intrin.h>
1137///
1138/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1139///
1140/// \param __a
1141/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1142/// used in the comparison.
1143/// \param __b
1144/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1145/// used in the comparison.
1146/// \returns An integer containing the comparison results.
1147static __inline__ int __DEFAULT_FN_ATTRS
1148_mm_comile_ss(__m128 __a, __m128 __b)
1149{
1150 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1151}
1152
1153/// Compares two 32-bit float values in the low-order bits of both
1154/// operands to determine if the first operand is greater than the second
1155/// operand.
1156///
1157/// The comparison returns 0 for false, 1 for true. If either value in a
1158/// comparison is NaN, returns 0.
1159///
1160/// \headerfile <x86intrin.h>
1161///
1162/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1163///
1164/// \param __a
1165/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1166/// used in the comparison.
1167/// \param __b
1168/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1169/// used in the comparison.
1170/// \returns An integer containing the comparison results.
1171static __inline__ int __DEFAULT_FN_ATTRS
1172_mm_comigt_ss(__m128 __a, __m128 __b)
1173{
1174 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1175}
1176
1177/// Compares two 32-bit float values in the low-order bits of both
1178/// operands to determine if the first operand is greater than or equal to
1179/// the second operand.
1180///
1181/// The comparison returns 0 for false, 1 for true. If either value in a
1182/// comparison is NaN, returns 0.
1183///
1184/// \headerfile <x86intrin.h>
1185///
1186/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1187///
1188/// \param __a
1189/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1190/// used in the comparison.
1191/// \param __b
1192/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1193/// used in the comparison.
1194/// \returns An integer containing the comparison results.
1195static __inline__ int __DEFAULT_FN_ATTRS
1196_mm_comige_ss(__m128 __a, __m128 __b)
1197{
1198 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1199}
1200
1201/// Compares two 32-bit float values in the low-order bits of both
1202/// operands to determine if the first operand is not equal to the second
1203/// operand.
1204///
1205/// The comparison returns 0 for false, 1 for true. If either value in a
1206/// comparison is NaN, returns 1.
1207///
1208/// \headerfile <x86intrin.h>
1209///
1210/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1211///
1212/// \param __a
1213/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1214/// used in the comparison.
1215/// \param __b
1216/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1217/// used in the comparison.
1218/// \returns An integer containing the comparison results.
1219static __inline__ int __DEFAULT_FN_ATTRS
1220_mm_comineq_ss(__m128 __a, __m128 __b)
1221{
1222 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1223}
1224
1225/// Performs an unordered comparison of two 32-bit float values using
1226/// the low-order bits of both operands to determine equality.
1227///
1228/// The comparison returns 0 for false, 1 for true. If either value in a
1229/// comparison is NaN, returns 0.
1230///
1231/// \headerfile <x86intrin.h>
1232///
1233/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1234///
1235/// \param __a
1236/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1237/// used in the comparison.
1238/// \param __b
1239/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240/// used in the comparison.
1241/// \returns An integer containing the comparison results.
1242static __inline__ int __DEFAULT_FN_ATTRS
1243_mm_ucomieq_ss(__m128 __a, __m128 __b)
1244{
1245 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1246}
1247
1248/// Performs an unordered comparison of two 32-bit float values using
1249/// the low-order bits of both operands to determine if the first operand is
1250/// less than the second operand.
1251///
1252/// The comparison returns 0 for false, 1 for true. If either value in a
1253/// comparison is NaN, returns 0.
1254///
1255/// \headerfile <x86intrin.h>
1256///
1257/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1258///
1259/// \param __a
1260/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1261/// used in the comparison.
1262/// \param __b
1263/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1264/// used in the comparison.
1265/// \returns An integer containing the comparison results.
1266static __inline__ int __DEFAULT_FN_ATTRS
1267_mm_ucomilt_ss(__m128 __a, __m128 __b)
1268{
1269 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1270}
1271
1272/// Performs an unordered comparison of two 32-bit float values using
1273/// the low-order bits of both operands to determine if the first operand is
1274/// less than or equal to the second operand.
1275///
1276/// The comparison returns 0 for false, 1 for true. If either value in a
1277/// comparison is NaN, returns 0.
1278///
1279/// \headerfile <x86intrin.h>
1280///
1281/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1282///
1283/// \param __a
1284/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1285/// used in the comparison.
1286/// \param __b
1287/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1288/// used in the comparison.
1289/// \returns An integer containing the comparison results.
1290static __inline__ int __DEFAULT_FN_ATTRS
1291_mm_ucomile_ss(__m128 __a, __m128 __b)
1292{
1293 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1294}
1295
1296/// Performs an unordered comparison of two 32-bit float values using
1297/// the low-order bits of both operands to determine if the first operand is
1298/// greater than the second operand.
1299///
1300/// The comparison returns 0 for false, 1 for true. If either value in a
1301/// comparison is NaN, returns 0.
1302///
1303/// \headerfile <x86intrin.h>
1304///
1305/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1306///
1307/// \param __a
1308/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1309/// used in the comparison.
1310/// \param __b
1311/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1312/// used in the comparison.
1313/// \returns An integer containing the comparison results.
1314static __inline__ int __DEFAULT_FN_ATTRS
1315_mm_ucomigt_ss(__m128 __a, __m128 __b)
1316{
1317 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1318}
1319
1320/// Performs an unordered comparison of two 32-bit float values using
1321/// the low-order bits of both operands to determine if the first operand is
1322/// greater than or equal to the second operand.
1323///
1324/// The comparison returns 0 for false, 1 for true. If either value in a
1325/// comparison is NaN, returns 0.
1326///
1327/// \headerfile <x86intrin.h>
1328///
1329/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1330///
1331/// \param __a
1332/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1333/// used in the comparison.
1334/// \param __b
1335/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1336/// used in the comparison.
1337/// \returns An integer containing the comparison results.
1338static __inline__ int __DEFAULT_FN_ATTRS
1339_mm_ucomige_ss(__m128 __a, __m128 __b)
1340{
1341 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1342}
1343
1344/// Performs an unordered comparison of two 32-bit float values using
1345/// the low-order bits of both operands to determine inequality.
1346///
1347/// The comparison returns 0 for false, 1 for true. If either value in a
1348/// comparison is NaN, returns 0.
1349///
1350/// \headerfile <x86intrin.h>
1351///
1352/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1353///
1354/// \param __a
1355/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1356/// used in the comparison.
1357/// \param __b
1358/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1359/// used in the comparison.
1360/// \returns An integer containing the comparison results.
1361static __inline__ int __DEFAULT_FN_ATTRS
1362_mm_ucomineq_ss(__m128 __a, __m128 __b)
1363{
1364 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1365}
1366
1367/// Converts a float value contained in the lower 32 bits of a vector of
1368/// [4 x float] into a 32-bit integer.
1369///
1370/// If the converted value does not fit in a 32-bit integer, raises a
1371/// floating-point invalid exception. If the exception is masked, returns
1372/// the most negative integer.
1373///
1374/// \headerfile <x86intrin.h>
1375///
1376/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1377/// instructions.
1378///
1379/// \param __a
1380/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1381/// used in the conversion.
1382/// \returns A 32-bit integer containing the converted value.
1383static __inline__ int __DEFAULT_FN_ATTRS
1385{
1386 return __builtin_ia32_cvtss2si((__v4sf)__a);
1387}
1388
1389/// Converts a float value contained in the lower 32 bits of a vector of
1390/// [4 x float] into a 32-bit integer.
1391///
1392/// If the converted value does not fit in a 32-bit integer, raises a
1393/// floating-point invalid exception. If the exception is masked, returns
1394/// the most negative integer.
1395///
1396/// \headerfile <x86intrin.h>
1397///
1398/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1399/// instructions.
1400///
1401/// \param __a
1402/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1403/// used in the conversion.
1404/// \returns A 32-bit integer containing the converted value.
1405static __inline__ int __DEFAULT_FN_ATTRS
1407{
1408 return _mm_cvtss_si32(__a);
1409}
1410
1411#ifdef __x86_64__
1412
1413/// Converts a float value contained in the lower 32 bits of a vector of
1414/// [4 x float] into a 64-bit integer.
1415///
1416/// If the converted value does not fit in a 32-bit integer, raises a
1417/// floating-point invalid exception. If the exception is masked, returns
1418/// the most negative integer.
1419///
1420/// \headerfile <x86intrin.h>
1421///
1422/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1423/// instructions.
1424///
1425/// \param __a
1426/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1427/// used in the conversion.
1428/// \returns A 64-bit integer containing the converted value.
1429static __inline__ long long __DEFAULT_FN_ATTRS
1430_mm_cvtss_si64(__m128 __a)
1431{
1432 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1433}
1434
1435#endif
1436
1437/// Converts two low-order float values in a 128-bit vector of
1438/// [4 x float] into a 64-bit vector of [2 x i32].
1439///
1440/// If a converted value does not fit in a 32-bit integer, raises a
1441/// floating-point invalid exception. If the exception is masked, returns
1442/// the most negative integer.
1443///
1444/// \headerfile <x86intrin.h>
1445///
1446/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1447///
1448/// \param __a
1449/// A 128-bit vector of [4 x float].
1450/// \returns A 64-bit integer vector containing the converted values.
1451static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1453{
1454 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1455}
1456
1457/// Converts two low-order float values in a 128-bit vector of
1458/// [4 x float] into a 64-bit vector of [2 x i32].
1459///
1460/// If a converted value does not fit in a 32-bit integer, raises a
1461/// floating-point invalid exception. If the exception is masked, returns
1462/// the most negative integer.
1463///
1464/// \headerfile <x86intrin.h>
1465///
1466/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1467///
1468/// \param __a
1469/// A 128-bit vector of [4 x float].
1470/// \returns A 64-bit integer vector containing the converted values.
1471static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1473{
1474 return _mm_cvtps_pi32(__a);
1475}
1476
1477/// Converts the lower (first) element of a vector of [4 x float] into a signed
1478/// truncated (rounded toward zero) 32-bit integer.
1479///
1480/// If the converted value does not fit in a 32-bit integer, raises a
1481/// floating-point invalid exception. If the exception is masked, returns
1482/// the most negative integer.
1483///
1484/// \headerfile <x86intrin.h>
1485///
1486/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1487/// instructions.
1488///
1489/// \param __a
1490/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1491/// used in the conversion.
1492/// \returns A 32-bit integer containing the converted value.
1493static __inline__ int __DEFAULT_FN_ATTRS
1495{
1496 return __builtin_ia32_cvttss2si((__v4sf)__a);
1497}
1498
1499/// Converts the lower (first) element of a vector of [4 x float] into a signed
1500/// truncated (rounded toward zero) 32-bit integer.
1501///
1502/// If the converted value does not fit in a 32-bit integer, raises a
1503/// floating-point invalid exception. If the exception is masked, returns
1504/// the most negative integer.
1505///
1506/// \headerfile <x86intrin.h>
1507///
1508/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1509/// instructions.
1510///
1511/// \param __a
1512/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1513/// used in the conversion.
1514/// \returns A 32-bit integer containing the converted value.
1515static __inline__ int __DEFAULT_FN_ATTRS
1517{
1518 return _mm_cvttss_si32(__a);
1519}
1520
1521#ifdef __x86_64__
1522/// Converts the lower (first) element of a vector of [4 x float] into a signed
1523/// truncated (rounded toward zero) 64-bit integer.
1524///
1525/// If the converted value does not fit in a 64-bit integer, raises a
1526/// floating-point invalid exception. If the exception is masked, returns
1527/// the most negative integer.
1528///
1529/// \headerfile <x86intrin.h>
1530///
1531/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1532/// instructions.
1533///
1534/// \param __a
1535/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1536/// used in the conversion.
1537/// \returns A 64-bit integer containing the converted value.
1538static __inline__ long long __DEFAULT_FN_ATTRS
1539_mm_cvttss_si64(__m128 __a)
1540{
1541 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1542}
1543#endif
1544
1545/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1546/// into two signed truncated (rounded toward zero) 32-bit integers,
1547/// returned in a 64-bit vector of [2 x i32].
1548///
1549/// If a converted value does not fit in a 32-bit integer, raises a
1550/// floating-point invalid exception. If the exception is masked, returns
1551/// the most negative integer.
1552///
1553/// \headerfile <x86intrin.h>
1554///
1555/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1556/// instructions.
1557///
1558/// \param __a
1559/// A 128-bit vector of [4 x float].
1560/// \returns A 64-bit integer vector containing the converted values.
1561static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1563{
1564 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1565}
1566
1567/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1568/// into two signed truncated (rounded toward zero) 64-bit integers,
1569/// returned in a 64-bit vector of [2 x i32].
1570///
1571/// If a converted value does not fit in a 32-bit integer, raises a
1572/// floating-point invalid exception. If the exception is masked, returns
1573/// the most negative integer.
1574///
1575/// \headerfile <x86intrin.h>
1576///
1577/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1578///
1579/// \param __a
1580/// A 128-bit vector of [4 x float].
1581/// \returns A 64-bit integer vector containing the converted values.
1582static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1584{
1585 return _mm_cvttps_pi32(__a);
1586}
1587
1588/// Converts a 32-bit signed integer value into a floating point value
1589/// and writes it to the lower 32 bits of the destination. The remaining
1590/// higher order elements of the destination vector are copied from the
1591/// corresponding elements in the first operand.
1592///
1593/// \headerfile <x86intrin.h>
1594///
1595/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1596///
1597/// \param __a
1598/// A 128-bit vector of [4 x float].
1599/// \param __b
1600/// A 32-bit signed integer operand containing the value to be converted.
1601/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1602/// converted value of the second operand. The upper 96 bits are copied from
1603/// the upper 96 bits of the first operand.
1604static __inline__ __m128 __DEFAULT_FN_ATTRS
1606{
1607 __a[0] = __b;
1608 return __a;
1609}
1610
1611/// Converts a 32-bit signed integer value into a floating point value
1612/// and writes it to the lower 32 bits of the destination. The remaining
1613/// higher order elements of the destination are copied from the
1614/// corresponding elements in the first operand.
1615///
1616/// \headerfile <x86intrin.h>
1617///
1618/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1619///
1620/// \param __a
1621/// A 128-bit vector of [4 x float].
1622/// \param __b
1623/// A 32-bit signed integer operand containing the value to be converted.
1624/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1625/// converted value of the second operand. The upper 96 bits are copied from
1626/// the upper 96 bits of the first operand.
1627static __inline__ __m128 __DEFAULT_FN_ATTRS
1629{
1630 return _mm_cvtsi32_ss(__a, __b);
1631}
1632
1633#ifdef __x86_64__
1634
1635/// Converts a 64-bit signed integer value into a floating point value
1636/// and writes it to the lower 32 bits of the destination. The remaining
1637/// higher order elements of the destination are copied from the
1638/// corresponding elements in the first operand.
1639///
1640/// \headerfile <x86intrin.h>
1641///
1642/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1643///
1644/// \param __a
1645/// A 128-bit vector of [4 x float].
1646/// \param __b
1647/// A 64-bit signed integer operand containing the value to be converted.
1648/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1649/// converted value of the second operand. The upper 96 bits are copied from
1650/// the upper 96 bits of the first operand.
1651static __inline__ __m128 __DEFAULT_FN_ATTRS
1652_mm_cvtsi64_ss(__m128 __a, long long __b)
1653{
1654 __a[0] = __b;
1655 return __a;
1656}
1657
1658#endif
1659
1660/// Converts two elements of a 64-bit vector of [2 x i32] into two
1661/// floating point values and writes them to the lower 64-bits of the
1662/// destination. The remaining higher order elements of the destination are
1663/// copied from the corresponding elements in the first operand.
1664///
1665/// \headerfile <x86intrin.h>
1666///
1667/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1668///
1669/// \param __a
1670/// A 128-bit vector of [4 x float].
1671/// \param __b
1672/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1673/// and written to the corresponding low-order elements in the destination.
1674/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1675/// converted value of the second operand. The upper 64 bits are copied from
1676/// the upper 64 bits of the first operand.
1677static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1678_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1679{
1680 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1681}
1682
1683/// Converts two elements of a 64-bit vector of [2 x i32] into two
1684/// floating point values and writes them to the lower 64-bits of the
1685/// destination. The remaining higher order elements of the destination are
1686/// copied from the corresponding elements in the first operand.
1687///
1688/// \headerfile <x86intrin.h>
1689///
1690/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1691///
1692/// \param __a
1693/// A 128-bit vector of [4 x float].
1694/// \param __b
1695/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1696/// and written to the corresponding low-order elements in the destination.
1697/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1698/// converted value from the second operand. The upper 64 bits are copied
1699/// from the upper 64 bits of the first operand.
1700static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1701_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1702{
1703 return _mm_cvtpi32_ps(__a, __b);
1704}
1705
1706/// Extracts a float value contained in the lower 32 bits of a vector of
1707/// [4 x float].
1708///
1709/// \headerfile <x86intrin.h>
1710///
1711/// This intrinsic has no corresponding instruction.
1712///
1713/// \param __a
1714/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1715/// used in the extraction.
1716/// \returns A 32-bit float containing the extracted value.
1717static __inline__ float __DEFAULT_FN_ATTRS
1719{
1720 return __a[0];
1721}
1722
1723/// Loads two packed float values from the address \a __p into the
1724/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1725/// are copied from the low-order bits of the first operand.
1726///
1727/// \headerfile <x86intrin.h>
1728///
1729/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1730///
1731/// \param __a
1732/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1733/// of the destination.
1734/// \param __p
1735/// A pointer to two packed float values. Bits [63:0] are written to bits
1736/// [127:64] of the destination.
1737/// \returns A 128-bit vector of [4 x float] containing the moved values.
1738static __inline__ __m128 __DEFAULT_FN_ATTRS
1739_mm_loadh_pi(__m128 __a, const __m64 *__p)
1740{
1741 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1742 struct __mm_loadh_pi_struct {
1743 __mm_loadh_pi_v2f32 __u;
1744 } __attribute__((__packed__, __may_alias__));
1745 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1746 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1747 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1748}
1749
1750/// Loads two packed float values from the address \a __p into the
1751/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1752/// are copied from the high-order bits of the first operand.
1753///
1754/// \headerfile <x86intrin.h>
1755///
1756/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1757///
1758/// \param __a
1759/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1760/// [127:64] of the destination.
1761/// \param __p
1762/// A pointer to two packed float values. Bits [63:0] are written to bits
1763/// [63:0] of the destination.
1764/// \returns A 128-bit vector of [4 x float] containing the moved values.
1765static __inline__ __m128 __DEFAULT_FN_ATTRS
1766_mm_loadl_pi(__m128 __a, const __m64 *__p)
1767{
1768 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1769 struct __mm_loadl_pi_struct {
1770 __mm_loadl_pi_v2f32 __u;
1771 } __attribute__((__packed__, __may_alias__));
1772 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1773 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1774 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1775}
1776
1777/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1778/// 32 bits of the vector are initialized with the single-precision
1779/// floating-point value loaded from a specified memory location. The upper
1780/// 96 bits are set to zero.
1781///
1782/// \headerfile <x86intrin.h>
1783///
1784/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1785///
1786/// \param __p
1787/// A pointer to a 32-bit memory location containing a single-precision
1788/// floating-point value.
1789/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1790/// lower 32 bits contain the value loaded from the memory location. The
1791/// upper 96 bits are set to zero.
1792static __inline__ __m128 __DEFAULT_FN_ATTRS
1793_mm_load_ss(const float *__p)
1794{
1795 struct __mm_load_ss_struct {
1796 float __u;
1797 } __attribute__((__packed__, __may_alias__));
1798 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1799 return __extension__ (__m128){ __u, 0, 0, 0 };
1800}
1801
1802/// Loads a 32-bit float value and duplicates it to all four vector
1803/// elements of a 128-bit vector of [4 x float].
1804///
1805/// \headerfile <x86intrin.h>
1806///
1807/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1808/// instruction.
1809///
1810/// \param __p
1811/// A pointer to a float value to be loaded and duplicated.
1812/// \returns A 128-bit vector of [4 x float] containing the loaded and
1813/// duplicated values.
1814static __inline__ __m128 __DEFAULT_FN_ATTRS
1815_mm_load1_ps(const float *__p)
1816{
1817 struct __mm_load1_ps_struct {
1818 float __u;
1819 } __attribute__((__packed__, __may_alias__));
1820 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1821 return __extension__ (__m128){ __u, __u, __u, __u };
1822}
1823
/* Alternative spelling of _mm_load1_ps(); expands to a plain call. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1825
1826/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1827/// memory location.
1828///
1829/// \headerfile <x86intrin.h>
1830///
1831/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1832///
1833/// \param __p
1834/// A pointer to a 128-bit memory location. The address of the memory
1835/// location has to be 128-bit aligned.
1836/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1837static __inline__ __m128 __DEFAULT_FN_ATTRS
1838_mm_load_ps(const float *__p)
1839{
1840 return *(const __m128*)__p;
1841}
1842
1843/// Loads a 128-bit floating-point vector of [4 x float] from an
1844/// unaligned memory location.
1845///
1846/// \headerfile <x86intrin.h>
1847///
1848/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1849///
1850/// \param __p
1851/// A pointer to a 128-bit memory location. The address of the memory
1852/// location does not have to be aligned.
1853/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1854static __inline__ __m128 __DEFAULT_FN_ATTRS
1855_mm_loadu_ps(const float *__p)
1856{
1857 struct __loadu_ps {
1858 __m128_u __v;
1859 } __attribute__((__packed__, __may_alias__));
1860 return ((const struct __loadu_ps*)__p)->__v;
1861}
1862
1863/// Loads four packed float values, in reverse order, from an aligned
1864/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1865///
1866/// \headerfile <x86intrin.h>
1867///
1868/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1869/// instruction.
1870///
1871/// \param __p
1872/// A pointer to a 128-bit memory location. The address of the memory
1873/// location has to be 128-bit aligned.
1874/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1875/// in reverse order.
1876static __inline__ __m128 __DEFAULT_FN_ATTRS
1877_mm_loadr_ps(const float *__p)
1878{
1879 __m128 __a = _mm_load_ps(__p);
1880 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1881}
1882
1883/// Create a 128-bit vector of [4 x float] with undefined values.
1884///
1885/// \headerfile <x86intrin.h>
1886///
1887/// This intrinsic has no corresponding instruction.
1888///
1889/// \returns A 128-bit vector of [4 x float] containing undefined values.
1890static __inline__ __m128 __DEFAULT_FN_ATTRS
1892{
1893 return (__m128)__builtin_ia32_undef128();
1894}
1895
1896/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1897/// 32 bits of the vector are initialized with the specified single-precision
1898/// floating-point value. The upper 96 bits are set to zero.
1899///
1900/// \headerfile <x86intrin.h>
1901///
1902/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1903///
1904/// \param __w
1905/// A single-precision floating-point value used to initialize the lower 32
1906/// bits of the result.
1907/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1908/// lower 32 bits contain the value provided in the source operand. The
1909/// upper 96 bits are set to zero.
1910static __inline__ __m128 __DEFAULT_FN_ATTRS
1911_mm_set_ss(float __w)
1912{
1913 return __extension__ (__m128){ __w, 0, 0, 0 };
1914}
1915
1916/// Constructs a 128-bit floating-point vector of [4 x float], with each
1917/// of the four single-precision floating-point vector elements set to the
1918/// specified single-precision floating-point value.
1919///
1920/// \headerfile <x86intrin.h>
1921///
1922/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1923///
1924/// \param __w
1925/// A single-precision floating-point value used to initialize each vector
1926/// element of the result.
1927/// \returns An initialized 128-bit floating-point vector of [4 x float].
1928static __inline__ __m128 __DEFAULT_FN_ATTRS
1929_mm_set1_ps(float __w)
1930{
1931 return __extension__ (__m128){ __w, __w, __w, __w };
1932}
1933
1934/* Microsoft specific. */
1935/// Constructs a 128-bit floating-point vector of [4 x float], with each
1936/// of the four single-precision floating-point vector elements set to the
1937/// specified single-precision floating-point value.
1938///
1939/// \headerfile <x86intrin.h>
1940///
1941/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1942///
1943/// \param __w
1944/// A single-precision floating-point value used to initialize each vector
1945/// element of the result.
1946/// \returns An initialized 128-bit floating-point vector of [4 x float].
1947static __inline__ __m128 __DEFAULT_FN_ATTRS
1948_mm_set_ps1(float __w)
1949{
1950 return _mm_set1_ps(__w);
1951}
1952
1953/// Constructs a 128-bit floating-point vector of [4 x float]
1954/// initialized with the specified single-precision floating-point values.
1955///
1956/// \headerfile <x86intrin.h>
1957///
1958/// This intrinsic is a utility function and does not correspond to a specific
1959/// instruction.
1960///
1961/// \param __z
1962/// A single-precision floating-point value used to initialize bits [127:96]
1963/// of the result.
1964/// \param __y
1965/// A single-precision floating-point value used to initialize bits [95:64]
1966/// of the result.
1967/// \param __x
1968/// A single-precision floating-point value used to initialize bits [63:32]
1969/// of the result.
1970/// \param __w
1971/// A single-precision floating-point value used to initialize bits [31:0]
1972/// of the result.
1973/// \returns An initialized 128-bit floating-point vector of [4 x float].
1974static __inline__ __m128 __DEFAULT_FN_ATTRS
1975_mm_set_ps(float __z, float __y, float __x, float __w)
1976{
1977 return __extension__ (__m128){ __w, __x, __y, __z };
1978}
1979
1980/// Constructs a 128-bit floating-point vector of [4 x float],
1981/// initialized in reverse order with the specified 32-bit single-precision
1982/// float-point values.
1983///
1984/// \headerfile <x86intrin.h>
1985///
1986/// This intrinsic is a utility function and does not correspond to a specific
1987/// instruction.
1988///
1989/// \param __z
1990/// A single-precision floating-point value used to initialize bits [31:0]
1991/// of the result.
1992/// \param __y
1993/// A single-precision floating-point value used to initialize bits [63:32]
1994/// of the result.
1995/// \param __x
1996/// A single-precision floating-point value used to initialize bits [95:64]
1997/// of the result.
1998/// \param __w
1999/// A single-precision floating-point value used to initialize bits [127:96]
2000/// of the result.
2001/// \returns An initialized 128-bit floating-point vector of [4 x float].
2002static __inline__ __m128 __DEFAULT_FN_ATTRS
2003_mm_setr_ps(float __z, float __y, float __x, float __w)
2004{
2005 return __extension__ (__m128){ __z, __y, __x, __w };
2006}
2007
2008/// Constructs a 128-bit floating-point vector of [4 x float] initialized
2009/// to zero.
2010///
2011/// \headerfile <x86intrin.h>
2012///
2013/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2014///
2015/// \returns An initialized 128-bit floating-point vector of [4 x float] with
2016/// all elements set to zero.
2017static __inline__ __m128 __DEFAULT_FN_ATTRS
2019{
2020 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2021}
2022
2023/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2024/// memory location.
2025///
2026/// \headerfile <x86intrin.h>
2027///
2028/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
2029///
2030/// \param __p
2031/// A pointer to a 64-bit memory location.
2032/// \param __a
2033/// A 128-bit vector of [4 x float] containing the values to be stored.
2034static __inline__ void __DEFAULT_FN_ATTRS
2035_mm_storeh_pi(__m64 *__p, __m128 __a)
2036{
2037 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2038 struct __mm_storeh_pi_struct {
2039 __mm_storeh_pi_v2f32 __u;
2040 } __attribute__((__packed__, __may_alias__));
2041 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2042}
2043
2044/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2045/// memory location.
2046///
2047/// \headerfile <x86intrin.h>
2048///
2049/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2050///
2051/// \param __p
2052/// A pointer to a memory location that will receive the float values.
2053/// \param __a
2054/// A 128-bit vector of [4 x float] containing the values to be stored.
2055static __inline__ void __DEFAULT_FN_ATTRS
2056_mm_storel_pi(__m64 *__p, __m128 __a)
2057{
2058 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2059 struct __mm_storeh_pi_struct {
2060 __mm_storeh_pi_v2f32 __u;
2061 } __attribute__((__packed__, __may_alias__));
2062 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2063}
2064
2065/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2066/// memory location.
2067///
2068/// \headerfile <x86intrin.h>
2069///
2070/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2071///
2072/// \param __p
2073/// A pointer to a 32-bit memory location.
2074/// \param __a
2075/// A 128-bit vector of [4 x float] containing the value to be stored.
2076static __inline__ void __DEFAULT_FN_ATTRS
2077_mm_store_ss(float *__p, __m128 __a)
2078{
2079 struct __mm_store_ss_struct {
2080 float __u;
2081 } __attribute__((__packed__, __may_alias__));
2082 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2083}
2084
2085/// Stores a 128-bit vector of [4 x float] to an unaligned memory
2086/// location.
2087///
2088/// \headerfile <x86intrin.h>
2089///
2090/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2091///
2092/// \param __p
2093/// A pointer to a 128-bit memory location. The address of the memory
2094/// location does not have to be aligned.
2095/// \param __a
2096/// A 128-bit vector of [4 x float] containing the values to be stored.
2097static __inline__ void __DEFAULT_FN_ATTRS
2098_mm_storeu_ps(float *__p, __m128 __a)
2099{
2100 struct __storeu_ps {
2101 __m128_u __v;
2102 } __attribute__((__packed__, __may_alias__));
2103 ((struct __storeu_ps*)__p)->__v = __a;
2104}
2105
2106/// Stores a 128-bit vector of [4 x float] into an aligned memory
2107/// location.
2108///
2109/// \headerfile <x86intrin.h>
2110///
2111/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2112///
2113/// \param __p
2114/// A pointer to a 128-bit memory location. The address of the memory
2115/// location has to be 16-byte aligned.
2116/// \param __a
2117/// A 128-bit vector of [4 x float] containing the values to be stored.
2118static __inline__ void __DEFAULT_FN_ATTRS
2119_mm_store_ps(float *__p, __m128 __a)
2120{
2121 *(__m128*)__p = __a;
2122}
2123
2124/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2125/// four contiguous elements in an aligned memory location.
2126///
2127/// \headerfile <x86intrin.h>
2128///
2129/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2130/// instruction.
2131///
2132/// \param __p
2133/// A pointer to a 128-bit memory location.
2134/// \param __a
2135/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2136/// of the four contiguous elements pointed by \a __p.
2137static __inline__ void __DEFAULT_FN_ATTRS
2138_mm_store1_ps(float *__p, __m128 __a)
2139{
2140 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2142}
2143
2144/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2145/// four contiguous elements in an aligned memory location.
2146///
2147/// \headerfile <x86intrin.h>
2148///
2149/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2150/// instruction.
2151///
2152/// \param __p
2153/// A pointer to a 128-bit memory location.
2154/// \param __a
2155/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2156/// of the four contiguous elements pointed by \a __p.
2157static __inline__ void __DEFAULT_FN_ATTRS
2158_mm_store_ps1(float *__p, __m128 __a)
2159{
2161}
2162
2163/// Stores float values from a 128-bit vector of [4 x float] to an
2164/// aligned memory location in reverse order.
2165///
2166/// \headerfile <x86intrin.h>
2167///
2168/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2169/// instruction.
2170///
2171/// \param __p
2172/// A pointer to a 128-bit memory location. The address of the memory
2173/// location has to be 128-bit aligned.
2174/// \param __a
2175/// A 128-bit vector of [4 x float] containing the values to be stored.
2176static __inline__ void __DEFAULT_FN_ATTRS
2177_mm_storer_ps(float *__p, __m128 __a)
2178{
2179 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2181}
2182
/* Selector values for the second argument of _mm_prefetch(), listed in
   ascending numeric order. _mm_prefetch() forwards bit 2 of the selector as
   the second argument of __builtin_prefetch and bits [1:0] as the third. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2189
2190#ifndef _MSC_VER
2191/* FIXME: We have to #define this because "sel" must be a constant integer, and
2192 Sema doesn't do any form of constant propagation yet. */
2193
/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    Bit 2 of \a sel is forwarded as the second argument of
///    __builtin_prefetch and bits [1:0] as the third.
#define _mm_prefetch(a, sel)                                                   \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 0x3))
2220#endif
2221
2222/// Stores a 64-bit integer in the specified aligned memory location. To
2223/// minimize caching, the data is flagged as non-temporal (unlikely to be
2224/// used again soon).
2225///
2226/// \headerfile <x86intrin.h>
2227///
2228/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2229///
2230/// \param __p
2231/// A pointer to an aligned memory location used to store the register value.
2232/// \param __a
2233/// A 64-bit integer containing the value to be stored.
2234static __inline__ void __DEFAULT_FN_ATTRS_MMX
2235_mm_stream_pi(void *__p, __m64 __a)
2236{
2237 __builtin_ia32_movntq((__m64 *)__p, __a);
2238}
2239
2240/// Moves packed float values from a 128-bit vector of [4 x float] to a
2241/// 128-bit aligned memory location. To minimize caching, the data is flagged
2242/// as non-temporal (unlikely to be used again soon).
2243///
2244/// \headerfile <x86intrin.h>
2245///
2246/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2247///
2248/// \param __p
2249/// A pointer to a 128-bit aligned memory location that will receive the
2250/// single-precision floating-point values.
2251/// \param __a
2252/// A 128-bit vector of [4 x float] containing the values to be moved.
2253static __inline__ void __DEFAULT_FN_ATTRS
2254_mm_stream_ps(void *__p, __m128 __a)
2255{
2256 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2257}
2258
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and those that follow it,
///    ensuring the system completes all previous stores before executing
///    subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2277
/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Each argument is parenthesized so that the casts apply to the whole
   argument expression rather than only to its first operand. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2300
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16 bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: The value is written to bits [15:0] of the destination. \n
///    1: The value is written to bits [31:16] of the destination. \n
///    2: The value is written to bits [47:32] of the destination. \n
///    3: The value is written to bits [63:48] of the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Each argument is parenthesized so that the casts apply to the whole
   argument expression rather than only to its first operand. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2331
2332/// Compares each of the corresponding packed 16-bit integer values of
2333/// the 64-bit integer vectors, and writes the greater value to the
2334/// corresponding bits in the destination.
2335///
2336/// \headerfile <x86intrin.h>
2337///
2338/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2339///
2340/// \param __a
2341/// A 64-bit integer vector containing one of the source operands.
2342/// \param __b
2343/// A 64-bit integer vector containing one of the source operands.
2344/// \returns A 64-bit integer vector containing the comparison results.
2345static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2346_mm_max_pi16(__m64 __a, __m64 __b)
2347{
2348 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2349}
2350
2351/// Compares each of the corresponding packed 8-bit unsigned integer
2352/// values of the 64-bit integer vectors, and writes the greater value to the
2353/// corresponding bits in the destination.
2354///
2355/// \headerfile <x86intrin.h>
2356///
2357/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2358///
2359/// \param __a
2360/// A 64-bit integer vector containing one of the source operands.
2361/// \param __b
2362/// A 64-bit integer vector containing one of the source operands.
2363/// \returns A 64-bit integer vector containing the comparison results.
2364static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2365_mm_max_pu8(__m64 __a, __m64 __b)
2366{
2367 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2368}
2369
2370/// Compares each of the corresponding packed 16-bit integer values of
2371/// the 64-bit integer vectors, and writes the lesser value to the
2372/// corresponding bits in the destination.
2373///
2374/// \headerfile <x86intrin.h>
2375///
2376/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2377///
2378/// \param __a
2379/// A 64-bit integer vector containing one of the source operands.
2380/// \param __b
2381/// A 64-bit integer vector containing one of the source operands.
2382/// \returns A 64-bit integer vector containing the comparison results.
2383static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2384_mm_min_pi16(__m64 __a, __m64 __b)
2385{
2386 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2387}
2388
2389/// Compares each of the corresponding packed 8-bit unsigned integer
2390/// values of the 64-bit integer vectors, and writes the lesser value to the
2391/// corresponding bits in the destination.
2392///
2393/// \headerfile <x86intrin.h>
2394///
2395/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2396///
2397/// \param __a
2398/// A 64-bit integer vector containing one of the source operands.
2399/// \param __b
2400/// A 64-bit integer vector containing one of the source operands.
2401/// \returns A 64-bit integer vector containing the comparison results.
2402static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2403_mm_min_pu8(__m64 __a, __m64 __b)
2404{
2405 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2406}
2407
2408/// Takes the most significant bit from each 8-bit element in a 64-bit
2409/// integer vector to create an 8-bit mask value. Zero-extends the value to
2410/// 32-bit integer and writes it to the destination.
2411///
2412/// \headerfile <x86intrin.h>
2413///
2414/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2415///
2416/// \param __a
2417/// A 64-bit integer vector containing the values with bits to be extracted.
2418/// \returns The most significant bit from each 8-bit element in \a __a,
2419/// written to bits [7:0].
2420static __inline__ int __DEFAULT_FN_ATTRS_MMX
2422{
2423 return __builtin_ia32_pmovmskb((__v8qi)__a);
2424}
2425
2426/// Multiplies packed 16-bit unsigned integer values and writes the
2427/// high-order 16 bits of each 32-bit product to the corresponding bits in
2428/// the destination.
2429///
2430/// \headerfile <x86intrin.h>
2431///
2432/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2433///
2434/// \param __a
2435/// A 64-bit integer vector containing one of the source operands.
2436/// \param __b
2437/// A 64-bit integer vector containing one of the source operands.
2438/// \returns A 64-bit integer vector containing the products of both operands.
2439static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2441{
2442 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2443}
2444
/// Shuffles the four 16-bit integers of a 64-bit integer vector into the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2481
2482/// Conditionally copies the values from each 8-bit element in the first
2483/// 64-bit integer vector operand to the specified memory location, as
2484/// specified by the most significant bit in the corresponding element in the
2485/// second 64-bit integer vector operand.
2486///
2487/// To minimize caching, the data is flagged as non-temporal
2488/// (unlikely to be used again soon).
2489///
2490/// \headerfile <x86intrin.h>
2491///
2492/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2493///
2494/// \param __d
2495/// A 64-bit integer vector containing the values with elements to be copied.
2496/// \param __n
2497/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2498/// element determines whether the corresponding element in operand \a __d
2499/// is copied. If the most significant bit of a given element is 1, the
2500/// corresponding element in operand \a __d is copied.
2501/// \param __p
2502/// A pointer to a 64-bit memory location that will receive the conditionally
2503/// copied integer values. The address of the memory location does not have
2504/// to be aligned.
2505static __inline__ void __DEFAULT_FN_ATTRS_MMX
2506_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2507{
2508 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2509}
2510
2511/// Computes the rounded averages of the packed unsigned 8-bit integer
2512/// values and writes the averages to the corresponding bits in the
2513/// destination.
2514///
2515/// \headerfile <x86intrin.h>
2516///
2517/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2518///
2519/// \param __a
2520/// A 64-bit integer vector containing one of the source operands.
2521/// \param __b
2522/// A 64-bit integer vector containing one of the source operands.
2523/// \returns A 64-bit integer vector containing the averages of both operands.
2524static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2525_mm_avg_pu8(__m64 __a, __m64 __b)
2526{
2527 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2528}
2529
2530/// Computes the rounded averages of the packed unsigned 16-bit integer
2531/// values and writes the averages to the corresponding bits in the
2532/// destination.
2533///
2534/// \headerfile <x86intrin.h>
2535///
2536/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2537///
2538/// \param __a
2539/// A 64-bit integer vector containing one of the source operands.
2540/// \param __b
2541/// A 64-bit integer vector containing one of the source operands.
2542/// \returns A 64-bit integer vector containing the averages of both operands.
2543static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2544_mm_avg_pu16(__m64 __a, __m64 __b)
2545{
2546 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2547}
2548
2549/// Subtracts the corresponding 8-bit unsigned integer values of the two
2550/// 64-bit vector operands and computes the absolute value for each of the
2551/// difference. Then sum of the 8 absolute differences is written to the
2552/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2553///
2554/// \headerfile <x86intrin.h>
2555///
2556/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2557///
2558/// \param __a
2559/// A 64-bit integer vector containing one of the source operands.
2560/// \param __b
2561/// A 64-bit integer vector containing one of the source operands.
2562/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2563/// sets of absolute differences between both operands. The upper bits are
2564/// cleared.
2565static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2566_mm_sad_pu8(__m64 __a, __m64 __b)
2567{
2568 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2569}
2570
#if defined(__cplusplus)
extern "C" {
#endif

/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);
2626
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up. The rounding-control field is cleared first so
///    that a previously selected mode cannot combine with the new one; this is
///    what _MM_SET_ROUNDING_MODE(_MM_ROUND_UP) expands to:
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif
2684
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2727
2728/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2729/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2730///
2731/// \headerfile <x86intrin.h>
2732///
2733/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2734///
2735/// \param __a
2736/// A 128-bit vector of [4 x float]. \n
2737/// Bits [95:64] are written to bits [31:0] of the destination. \n
2738/// Bits [127:96] are written to bits [95:64] of the destination.
2739/// \param __b
2740/// A 128-bit vector of [4 x float].
2741/// Bits [95:64] are written to bits [63:32] of the destination. \n
2742/// Bits [127:96] are written to bits [127:96] of the destination.
2743/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2744static __inline__ __m128 __DEFAULT_FN_ATTRS
2745_mm_unpackhi_ps(__m128 __a, __m128 __b)
2746{
2747 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2748}
2749
2750/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2751/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2752///
2753/// \headerfile <x86intrin.h>
2754///
2755/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2756///
2757/// \param __a
2758/// A 128-bit vector of [4 x float]. \n
2759/// Bits [31:0] are written to bits [31:0] of the destination. \n
2760/// Bits [63:32] are written to bits [95:64] of the destination.
2761/// \param __b
2762/// A 128-bit vector of [4 x float]. \n
2763/// Bits [31:0] are written to bits [63:32] of the destination. \n
2764/// Bits [63:32] are written to bits [127:96] of the destination.
2765/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2766static __inline__ __m128 __DEFAULT_FN_ATTRS
2767_mm_unpacklo_ps(__m128 __a, __m128 __b)
2768{
2769 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2770}
2771
2772/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2773/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2774/// 96 bits are set to the upper 96 bits of the first parameter.
2775///
2776/// \headerfile <x86intrin.h>
2777///
2778/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2779/// instruction.
2780///
2781/// \param __a
2782/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2783/// written to the upper 96 bits of the result.
2784/// \param __b
2785/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2786/// written to the lower 32 bits of the result.
2787/// \returns A 128-bit floating-point vector of [4 x float].
2788static __inline__ __m128 __DEFAULT_FN_ATTRS
2789_mm_move_ss(__m128 __a, __m128 __b)
2790{
2791 __a[0] = __b[0];
2792 return __a;
2793}
2794
2795/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2796/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2797/// 64 bits are set to the upper 64 bits of the first parameter.
2798///
2799/// \headerfile <x86intrin.h>
2800///
2801/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2802///
2803/// \param __a
2804/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2805/// written to the upper 64 bits of the result.
2806/// \param __b
2807/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2808/// written to the lower 64 bits of the result.
2809/// \returns A 128-bit floating-point vector of [4 x float].
2810static __inline__ __m128 __DEFAULT_FN_ATTRS
2811_mm_movehl_ps(__m128 __a, __m128 __b)
2812{
2813 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2814}
2815
2816/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2817/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2818/// 64 bits are set to the lower 64 bits of the second parameter.
2819///
2820/// \headerfile <x86intrin.h>
2821///
2822/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2823///
2824/// \param __a
2825/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2826/// written to the lower 64 bits of the result.
2827/// \param __b
2828/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2829/// written to the upper 64 bits of the result.
2830/// \returns A 128-bit floating-point vector of [4 x float].
2831static __inline__ __m128 __DEFAULT_FN_ATTRS
2832_mm_movelh_ps(__m128 __a, __m128 __b)
2833{
2834 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2835}
2836
2837/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2838/// float].
2839///
2840/// \headerfile <x86intrin.h>
2841///
2842/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2843///
2844/// \param __a
2845/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2846/// from the corresponding elements in this operand.
2847/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2848/// values from the operand.
2849static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2851{
2852 __m64 __b, __c;
2853 __m128 __r;
2854
2858 __r = _mm_setzero_ps();
2859 __r = _mm_cvtpi32_ps(__r, __c);
2860 __r = _mm_movelh_ps(__r, __r);
2862 __r = _mm_cvtpi32_ps(__r, __c);
2863
2864 return __r;
2865}
2866
2867/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2868/// 128-bit vector of [4 x float].
2869///
2870/// \headerfile <x86intrin.h>
2871///
2872/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2873///
2874/// \param __a
2875/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2876/// destination are copied from the corresponding elements in this operand.
2877/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2878/// values from the operand.
2879static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2881{
2882 __m64 __b, __c;
2883 __m128 __r;
2884
2887 __r = _mm_setzero_ps();
2888 __r = _mm_cvtpi32_ps(__r, __c);
2889 __r = _mm_movelh_ps(__r, __r);
2891 __r = _mm_cvtpi32_ps(__r, __c);
2892
2893 return __r;
2894}
2895
2896/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2897/// into a 128-bit vector of [4 x float].
2898///
2899/// \headerfile <x86intrin.h>
2900///
2901/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2902///
2903/// \param __a
2904/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2905/// from the corresponding lower 4 elements in this operand.
2906/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2907/// values from the operand.
2908static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2910{
2911 __m64 __b;
2912
2916
2917 return _mm_cvtpi16_ps(__b);
2918}
2919
2920/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2921/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2922///
2923/// \headerfile <x86intrin.h>
2924///
2925/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2926///
2927/// \param __a
2928/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2929/// destination are copied from the corresponding lower 4 elements in this
2930/// operand.
2931/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2932/// values from the source operand.
2933static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2935{
2936 __m64 __b;
2937
2940
2941 return _mm_cvtpi16_ps(__b);
2942}
2943
2944/// Converts the two 32-bit signed integer values from each 64-bit vector
2945/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2946///
2947/// \headerfile <x86intrin.h>
2948///
2949/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2950///
2951/// \param __a
2952/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2953/// copied from the elements in this operand.
2954/// \param __b
2955/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2956/// copied from the elements in this operand.
2957/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2958/// copied and converted values from the first operand. The upper 64 bits
2959/// contain the copied and converted values from the second operand.
2960static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2962{
2963 __m128 __c;
2964
2965 __c = _mm_setzero_ps();
2968
2969 return _mm_cvtpi32_ps(__c, __a);
2970}
2971
2972/// Converts each single-precision floating-point element of a 128-bit
2973/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2974/// packs the results into a 64-bit integer vector of [4 x i16].
2975///
2976/// If the floating-point element is NaN or infinity, or if the
2977/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2978/// it is converted to 0x8000. Otherwise if the floating-point element is
2979/// greater than 0x7FFF, it is converted to 0x7FFF.
2980///
2981/// \headerfile <x86intrin.h>
2982///
2983/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2984///
2985/// \param __a
2986/// A 128-bit floating-point vector of [4 x float].
2987/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2988/// values.
2989static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2991{
2992 __m64 __b, __c;
2993
2997
2998 return _mm_packs_pi32(__b, __c);
2999}
3000
3001/// Converts each single-precision floating-point element of a 128-bit
3002/// floating-point vector of [4 x float] into an 8-bit signed integer, and
3003/// packs the results into the lower 32 bits of a 64-bit integer vector of
3004/// [8 x i8]. The upper 32 bits of the vector are set to 0.
3005///
3006/// If the floating-point element is NaN or infinity, or if the
3007/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
3008/// is converted to 0x80. Otherwise if the floating-point element is greater
3009/// than 0x7F, it is converted to 0x7F.
3010///
3011/// \headerfile <x86intrin.h>
3012///
3013/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3014///
3015/// \param __a
3016/// 128-bit floating-point vector of [4 x float].
3017/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
3018/// converted values and the uppper 32 bits are set to zero.
3019static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
3021{
3022 __m64 __b, __c;
3023
3026
3027 return _mm_packs_pi16(__b, __c);
3028}
3029
3030/// Extracts the sign bits from each single-precision floating-point
3031/// element of a 128-bit floating-point vector of [4 x float] and returns the
3032/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3033/// to zero.
3034///
3035/// \headerfile <x86intrin.h>
3036///
3037/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3038///
3039/// \param __a
3040/// A 128-bit floating-point vector of [4 x float].
3041/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3042/// single-precision floating-point element of the parameter. Bits [31:4] are
3043/// set to zero.
3044static __inline__ int __DEFAULT_FN_ATTRS
3046{
3047 return __builtin_ia32_movmskps((__v4sf)__a);
3048}
3049
/* Compare: predicate immediates accepted by _mm_cmp_ps / _mm_cmp_ss. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
3059
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3094
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3129
/* Declaration attribute requesting 16-byte alignment. The reserved
 * __aligned__ spelling is used so that a user macro named `aligned` cannot
 * change the expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))

/* Packs four 2-bit selectors into one 8-bit shuffle immediate: z lands in
 * bits [7:6], y in [5:4], x in [3:2] and w in [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3133
/* MXCSR exception-state flags; _MM_EXCEPT_MASK is the OR of all six. */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception-mask bits; _MM_MASK_MASK is the OR of all six. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding-control field (two bits, selected by _MM_ROUND_MASK). */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero bit. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)

/* Read one MXCSR field: the register value masked down to that field. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Write one MXCSR field: clear the field in the current value, OR in x, and
 * store the result back. x is not masked, so it must only contain bits that
 * belong to the field being set. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3169
/* Transposes, in place, the 4x4 float matrix whose rows are the four __m128
 * lvalues row0..row3. Each argument is evaluated more than once, so the
 * arguments must be free of side effects. The temporaries use reserved names
 * so that caller variables (for example ones called tmp0..tmp3) can be passed
 * as rows without being shadowed by the macro's own locals. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
3182
/* Aliases for compatibility: each instruction-style _m_* name expands to the
 * corresponding _mm_* intrinsic. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
3198
3199#undef __DEFAULT_FN_ATTRS
3200#undef __DEFAULT_FN_ATTRS_MMX
3201
3202/* Ugly hack for backwards-compatibility (compatible with gcc) */
3203#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3204#include <emmintrin.h>
3205#endif
3206
3207#endif /* __XMMINTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline__ uint32_t uint32_t __y
Definition: arm_acle.h:122
static __inline__ void int __a
Definition: emmintrin.h:4057
struct __storeu_i16 *__P __v
Definition: immintrin.h:480
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:228
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:276
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1245
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1223
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2)
Converts, with saturation, 32-bit signed integers from both 64-bit integer vector parameters of [2 x ...
Definition: mmintrin.h:153
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:299
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1280
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition: mmintrin.h:128
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1172
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition: xmmintrin.h:1494
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition: xmmintrin.h:257
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:573
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition: xmmintrin.h:222
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:204
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:948
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
Definition: xmmintrin.h:525
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2018
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1452
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1911
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:420
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:551
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1406
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition: xmmintrin.h:504
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
Definition: xmmintrin.h:78
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition: xmmintrin.h:1838
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition: xmmintrin.h:2440
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for ineq...
Definition: xmmintrin.h:769
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1148
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition: xmmintrin.h:1472
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition: xmmintrin.h:442
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
Definition: xmmintrin.h:2177
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Creates a 128-bit vector of [4 x float] with undefined values.
Definition: xmmintrin.h:1891
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttps_pi32(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition: xmmintrin.h:1562
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:846
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1267
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:599
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1628
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition: xmmintrin.h:2235
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2850
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
Definition: xmmintrin.h:310
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:3020
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:2056
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1291
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:721
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition: xmmintrin.h:1099
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:2138
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
Definition: xmmintrin.h:2880
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1948
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition: xmmintrin.h:163
#define __DEFAULT_FN_ATTRS
Definition: xmmintrin.h:35
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
Definition: xmmintrin.h:402
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
Definition: xmmintrin.h:293
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1339
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition: xmmintrin.h:2544
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1124
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
Definition: xmmintrin.h:1766
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition: xmmintrin.h:2098
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition: xmmintrin.h:100
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
Definition: xmmintrin.h:121
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
Definition: xmmintrin.h:1815
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b)
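Constructs a 128-bit floating-point vector of [4 x float]. The lower 64 bits are set to the lower 64 bits of the first parameter. The upper 64 bits are set to the lower 64 bits of the second parameter.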
Definition: xmmintrin.h:2832
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
Definition: xmmintrin.h:356
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
Definition: xmmintrin.h:2254
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1196
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition: xmmintrin.h:1384
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:671
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition: xmmintrin.h:2003
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1315
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition: xmmintrin.h:2745
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:2077
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:896
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
Definition: xmmintrin.h:1739
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:479
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:274
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b)
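Constructs a 128-bit floating-point vector of [4 x float]. The lower 32 bits are set to the lower 32 bits of the second parameter. The upper 96 bits are set to the upper 96 bits of the first parameter.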
Definition: xmmintrin.h:2789
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition: xmmintrin.h:1929
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition: xmmintrin.h:2119
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition: xmmintrin.h:460
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition: xmmintrin.h:239
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality.
Definition: xmmintrin.h:747
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
Definition: xmmintrin.h:1718
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors, and writes the greater value to the corresponding bits in the destination.
Definition: xmmintrin.h:2365
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:143
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors, and writes the lesser value to the corresponding bits in the destination.
Definition: xmmintrin.h:2384
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition: xmmintrin.h:1605
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtt_ps2pi(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition: xmmintrin.h:1583
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition: xmmintrin.h:1516
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition: xmmintrin.h:3045
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition: xmmintrin.h:2961
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b)
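Constructs a 128-bit floating-point vector of [4 x float]. The upper 64 bits are set to the upper 64 bits of the first parameter. The lower 64 bits are set to the upper 64 bits of the second parameter.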
Definition: xmmintrin.h:2811
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition: xmmintrin.h:1877
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:1000
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:819
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition: xmmintrin.h:2035
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:921
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:973
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:1024
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:647
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1701
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1243
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition: xmmintrin.h:58
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition: xmmintrin.h:1975
static __inline__ int __DEFAULT_FN_ATTRS_MMX _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
Definition: xmmintrin.h:2421
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:796
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition: xmmintrin.h:2158
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition: xmmintrin.h:335
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
Definition: xmmintrin.h:2934
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:621
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
#define __DEFAULT_FN_ATTRS_MMX
Definition: xmmintrin.h:38
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition: xmmintrin.h:2525
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
Definition: xmmintrin.h:2909
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition: xmmintrin.h:2767
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
Definition: xmmintrin.h:381
static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition: xmmintrin.h:2506
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors, and writes the greater value to the corresponding bits in the destination.
Definition: xmmintrin.h:2346
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition: xmmintrin.h:1362
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition: xmmintrin.h:2990
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition: xmmintrin.h:1220
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors, and writes the lesser value to the corresponding bits in the destination.
Definition: xmmintrin.h:2403
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:869
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition: xmmintrin.h:2566
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition: xmmintrin.h:1678
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:1051
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition: xmmintrin.h:697
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition: xmmintrin.h:1075
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition: xmmintrin.h:1855
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition: xmmintrin.h:1793
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...
Definition: xmmintrin.h:185