clang 22.0.0git
xmmintrin.h
Go to the documentation of this file.
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <mmintrin.h>
18
/* Float vector types: all are 16 bytes of float elements. __m128 is declared
 * with 16-byte alignment and __m128_u with 1-byte alignment. */
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
28
29/* This header should only be included in a hosted environment as it depends on
30 * a standard library to provide allocation routines. */
31#if __STDC_HOSTED__
32#include <mm_malloc.h>
33#endif
34
35/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),          \
                 __min_vector_width__(128)))

/* The *_CONSTEXPR variants append `constexpr` only when compiling as C++11 or
 * later; otherwise they are plain aliases of the base attribute macros. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#endif
50
/* Width-changing helpers built on __builtin_shufflevector. Each expansion is
 * wrapped in an outer pair of parentheses so that the leading cast cannot be
 * re-associated with a postfix operator at the use site. */

/* Keep element 0 of x viewed as __v2di and return it as an __m64. */
#define __trunc64(x)                                                           \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
/* Elements 0-1 come from x (as __v2si); elements 2-3 come from a zero vector. */
#define __zext128(x)                                                           \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,  \
                                    1, 2, 3))
/* Elements 0-1 come from x (as __v2si); elements 2-3 are left unspecified
 * (shuffle index -1). */
#define __anyext128(x)                                                         \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,  \
                                    1, -1, -1))
/* Elements 0-1 come from x (as __v4si); elements 2-3 come from a zero vector. */
#define __zeroupper64(x)                                                       \
  ((__m128i)__builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,  \
                                    1, 4, 5))
62
63/// Adds the 32-bit float values in the low-order bits of the operands.
64///
65/// \headerfile <x86intrin.h>
66///
67/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
68///
69/// \param __a
70/// A 128-bit vector of [4 x float] containing one of the source operands.
71/// The lower 32 bits of this operand are used in the calculation.
72/// \param __b
73/// A 128-bit vector of [4 x float] containing one of the source operands.
74/// The lower 32 bits of this operand are used in the calculation.
75/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
76/// of the lower 32 bits of both operands. The upper 96 bits are copied from
77/// the upper 96 bits of the first source operand.
78static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
79_mm_add_ss(__m128 __a, __m128 __b) {
80 __a[0] += __b[0];
81 return __a;
82}
83
84/// Adds two 128-bit vectors of [4 x float], and returns the results of
85/// the addition.
86///
87/// \headerfile <x86intrin.h>
88///
89/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
90///
91/// \param __a
92/// A 128-bit vector of [4 x float] containing one of the source operands.
93/// \param __b
94/// A 128-bit vector of [4 x float] containing one of the source operands.
95/// \returns A 128-bit vector of [4 x float] containing the sums of both
96/// operands.
97static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
98_mm_add_ps(__m128 __a, __m128 __b) {
99 return (__m128)((__v4sf)__a + (__v4sf)__b);
100}
101
102/// Subtracts the 32-bit float value in the low-order bits of the second
103/// operand from the corresponding value in the first operand.
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
108///
109/// \param __a
110/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
111/// of this operand are used in the calculation.
112/// \param __b
113/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
114/// bits of this operand are used in the calculation.
115/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
116/// difference of the lower 32 bits of both operands. The upper 96 bits are
117/// copied from the upper 96 bits of the first source operand.
118static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
119_mm_sub_ss(__m128 __a, __m128 __b) {
120 __a[0] -= __b[0];
121 return __a;
122}
123
124/// Subtracts each of the values of the second operand from the first
125/// operand, both of which are 128-bit vectors of [4 x float] and returns
126/// the results of the subtraction.
127///
128/// \headerfile <x86intrin.h>
129///
130/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
131///
132/// \param __a
133/// A 128-bit vector of [4 x float] containing the minuend.
134/// \param __b
135/// A 128-bit vector of [4 x float] containing the subtrahend.
136/// \returns A 128-bit vector of [4 x float] containing the differences between
137/// both operands.
138static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
139_mm_sub_ps(__m128 __a, __m128 __b) {
140 return (__m128)((__v4sf)__a - (__v4sf)__b);
141}
142
143/// Multiplies two 32-bit float values in the low-order bits of the
144/// operands.
145///
146/// \headerfile <x86intrin.h>
147///
148/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
149///
150/// \param __a
151/// A 128-bit vector of [4 x float] containing one of the source operands.
152/// The lower 32 bits of this operand are used in the calculation.
153/// \param __b
154/// A 128-bit vector of [4 x float] containing one of the source operands.
155/// The lower 32 bits of this operand are used in the calculation.
156/// \returns A 128-bit vector of [4 x float] containing the product of the lower
157/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
158/// bits of the first source operand.
159static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
160_mm_mul_ss(__m128 __a, __m128 __b) {
161 __a[0] *= __b[0];
162 return __a;
163}
164
165/// Multiplies two 128-bit vectors of [4 x float] and returns the
166/// results of the multiplication.
167///
168/// \headerfile <x86intrin.h>
169///
170/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
171///
172/// \param __a
173/// A 128-bit vector of [4 x float] containing one of the source operands.
174/// \param __b
175/// A 128-bit vector of [4 x float] containing one of the source operands.
176/// \returns A 128-bit vector of [4 x float] containing the products of both
177/// operands.
178static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
179_mm_mul_ps(__m128 __a, __m128 __b) {
180 return (__m128)((__v4sf)__a * (__v4sf)__b);
181}
182
183/// Divides the value in the low-order 32 bits of the first operand by
184/// the corresponding value in the second operand.
185///
186/// \headerfile <x86intrin.h>
187///
188/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
189///
190/// \param __a
191/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
192/// bits of this operand are used in the calculation.
193/// \param __b
194/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
195/// of this operand are used in the calculation.
196/// \returns A 128-bit vector of [4 x float] containing the quotients of the
197/// lower 32 bits of both operands. The upper 96 bits are copied from the
198/// upper 96 bits of the first source operand.
199static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
200_mm_div_ss(__m128 __a, __m128 __b) {
201 __a[0] /= __b[0];
202 return __a;
203}
204
205/// Divides two 128-bit vectors of [4 x float].
206///
207/// \headerfile <x86intrin.h>
208///
209/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
210///
211/// \param __a
212/// A 128-bit vector of [4 x float] containing the dividend.
213/// \param __b
214/// A 128-bit vector of [4 x float] containing the divisor.
215/// \returns A 128-bit vector of [4 x float] containing the quotients of both
216/// operands.
217static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
218_mm_div_ps(__m128 __a, __m128 __b) {
219 return (__m128)((__v4sf)__a / (__v4sf)__b);
220}
221
222/// Calculates the square root of the value stored in the low-order bits
223/// of a 128-bit vector of [4 x float].
224///
225/// \headerfile <x86intrin.h>
226///
227/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
228///
229/// \param __a
230/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
231/// used in the calculation.
232/// \returns A 128-bit vector of [4 x float] containing the square root of the
233/// value in the low-order bits of the operand.
234static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
235 __a[0] = __builtin_elementwise_sqrt(__a[0]);
236 return __a;
237}
238
239/// Calculates the square roots of the values stored in a 128-bit vector
240/// of [4 x float].
241///
242/// \headerfile <x86intrin.h>
243///
244/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
245///
246/// \param __a
247/// A 128-bit vector of [4 x float].
248/// \returns A 128-bit vector of [4 x float] containing the square roots of the
249/// values in the operand.
250static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
251 return __builtin_elementwise_sqrt(__a);
252}
253
254/// Calculates the approximate reciprocal of the value stored in the
255/// low-order bits of a 128-bit vector of [4 x float].
256///
257/// \headerfile <x86intrin.h>
258///
259/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
260///
261/// \param __a
262/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
263/// used in the calculation.
264/// \returns A 128-bit vector of [4 x float] containing the approximate
265/// reciprocal of the value in the low-order bits of the operand.
266static __inline__ __m128 __DEFAULT_FN_ATTRS
268{
269 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
270}
271
272/// Calculates the approximate reciprocals of the values stored in a
273/// 128-bit vector of [4 x float].
274///
275/// \headerfile <x86intrin.h>
276///
277/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
278///
279/// \param __a
280/// A 128-bit vector of [4 x float].
281/// \returns A 128-bit vector of [4 x float] containing the approximate
282/// reciprocals of the values in the operand.
283static __inline__ __m128 __DEFAULT_FN_ATTRS
285{
286 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
287}
288
289/// Calculates the approximate reciprocal of the square root of the value
290/// stored in the low-order bits of a 128-bit vector of [4 x float].
291///
292/// \headerfile <x86intrin.h>
293///
294/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
295///
296/// \param __a
297/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
298/// used in the calculation.
299/// \returns A 128-bit vector of [4 x float] containing the approximate
300/// reciprocal of the square root of the value in the low-order bits of the
301/// operand.
302static __inline__ __m128 __DEFAULT_FN_ATTRS
304{
305 return __builtin_ia32_rsqrtss((__v4sf)__a);
306}
307
308/// Calculates the approximate reciprocals of the square roots of the
309/// values stored in a 128-bit vector of [4 x float].
310///
311/// \headerfile <x86intrin.h>
312///
313/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
314///
315/// \param __a
316/// A 128-bit vector of [4 x float].
317/// \returns A 128-bit vector of [4 x float] containing the approximate
318/// reciprocals of the square roots of the values in the operand.
319static __inline__ __m128 __DEFAULT_FN_ATTRS
321{
322 return __builtin_ia32_rsqrtps((__v4sf)__a);
323}
324
325/// Compares two 32-bit float values in the low-order bits of both
326/// operands and returns the lesser value in the low-order bits of the
327/// vector of [4 x float].
328///
329/// If either value in a comparison is NaN, returns the value from \a __b.
330///
331/// \headerfile <x86intrin.h>
332///
333/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
334///
335/// \param __a
336/// A 128-bit vector of [4 x float] containing one of the operands. The lower
337/// 32 bits of this operand are used in the comparison.
338/// \param __b
339/// A 128-bit vector of [4 x float] containing one of the operands. The lower
340/// 32 bits of this operand are used in the comparison.
341/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
342/// minimum value between both operands. The upper 96 bits are copied from
343/// the upper 96 bits of the first source operand.
344static __inline__ __m128 __DEFAULT_FN_ATTRS
345_mm_min_ss(__m128 __a, __m128 __b)
346{
347 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
348}
349
350/// Compares two 128-bit vectors of [4 x float] and returns the lesser
351/// of each pair of values.
352///
353/// If either value in a comparison is NaN, returns the value from \a __b.
354///
355/// \headerfile <x86intrin.h>
356///
357/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
358///
359/// \param __a
360/// A 128-bit vector of [4 x float] containing one of the operands.
361/// \param __b
362/// A 128-bit vector of [4 x float] containing one of the operands.
363/// \returns A 128-bit vector of [4 x float] containing the minimum values
364/// between both operands.
365static __inline__ __m128 __DEFAULT_FN_ATTRS
366_mm_min_ps(__m128 __a, __m128 __b)
367{
368 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
369}
370
371/// Compares two 32-bit float values in the low-order bits of both
372/// operands and returns the greater value in the low-order bits of a 128-bit
373/// vector of [4 x float].
374///
375/// If either value in a comparison is NaN, returns the value from \a __b.
376///
377/// \headerfile <x86intrin.h>
378///
379/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
380///
381/// \param __a
382/// A 128-bit vector of [4 x float] containing one of the operands. The lower
383/// 32 bits of this operand are used in the comparison.
384/// \param __b
385/// A 128-bit vector of [4 x float] containing one of the operands. The lower
386/// 32 bits of this operand are used in the comparison.
387/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
388/// maximum value between both operands. The upper 96 bits are copied from
389/// the upper 96 bits of the first source operand.
390static __inline__ __m128 __DEFAULT_FN_ATTRS
391_mm_max_ss(__m128 __a, __m128 __b)
392{
393 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
394}
395
396/// Compares two 128-bit vectors of [4 x float] and returns the greater
397/// of each pair of values.
398///
399/// If either value in a comparison is NaN, returns the value from \a __b.
400///
401/// \headerfile <x86intrin.h>
402///
403/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
404///
405/// \param __a
406/// A 128-bit vector of [4 x float] containing one of the operands.
407/// \param __b
408/// A 128-bit vector of [4 x float] containing one of the operands.
409/// \returns A 128-bit vector of [4 x float] containing the maximum values
410/// between both operands.
411static __inline__ __m128 __DEFAULT_FN_ATTRS
412_mm_max_ps(__m128 __a, __m128 __b)
413{
414 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
415}
416
417/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
418///
419/// \headerfile <x86intrin.h>
420///
421/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
422///
423/// \param __a
424/// A 128-bit vector containing one of the source operands.
425/// \param __b
426/// A 128-bit vector containing one of the source operands.
427/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
428/// values between both operands.
429static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
430_mm_and_ps(__m128 __a, __m128 __b) {
431 return (__m128)((__v4su)__a & (__v4su)__b);
432}
433
434/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
435/// the one's complement of the values contained in the first source
436/// operand.
437///
438/// \headerfile <x86intrin.h>
439///
440/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
441///
442/// \param __a
443/// A 128-bit vector of [4 x float] containing the first source operand. The
444/// one's complement of this value is used in the bitwise AND.
445/// \param __b
446/// A 128-bit vector of [4 x float] containing the second source operand.
447/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
448/// one's complement of the first operand and the values in the second
449/// operand.
450static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
451_mm_andnot_ps(__m128 __a, __m128 __b) {
452 return (__m128)(~(__v4su)__a & (__v4su)__b);
453}
454
455/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
456///
457/// \headerfile <x86intrin.h>
458///
459/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
460///
461/// \param __a
462/// A 128-bit vector of [4 x float] containing one of the source operands.
463/// \param __b
464/// A 128-bit vector of [4 x float] containing one of the source operands.
465/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
466/// values between both operands.
467static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
468_mm_or_ps(__m128 __a, __m128 __b) {
469 return (__m128)((__v4su)__a | (__v4su)__b);
470}
471
472/// Performs a bitwise exclusive OR of two 128-bit vectors of
473/// [4 x float].
474///
475/// \headerfile <x86intrin.h>
476///
477/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
478///
479/// \param __a
480/// A 128-bit vector of [4 x float] containing one of the source operands.
481/// \param __b
482/// A 128-bit vector of [4 x float] containing one of the source operands.
483/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
484/// of the values between both operands.
485static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
486_mm_xor_ps(__m128 __a, __m128 __b) {
487 return (__m128)((__v4su)__a ^ (__v4su)__b);
488}
489
490/// Compares two 32-bit float values in the low-order bits of both
491/// operands for equality.
492///
493/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
495/// If either value in a comparison is NaN, returns false.
496///
497/// \headerfile <x86intrin.h>
498///
499/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
500///
501/// \param __a
502/// A 128-bit vector of [4 x float] containing one of the operands. The lower
503/// 32 bits of this operand are used in the comparison.
504/// \param __b
505/// A 128-bit vector of [4 x float] containing one of the operands. The lower
506/// 32 bits of this operand are used in the comparison.
507/// \returns A 128-bit vector of [4 x float] containing the comparison results
508/// in the low-order bits.
509static __inline__ __m128 __DEFAULT_FN_ATTRS
510_mm_cmpeq_ss(__m128 __a, __m128 __b)
511{
512 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
513}
514
515/// Compares each of the corresponding 32-bit float values of the
516/// 128-bit vectors of [4 x float] for equality.
517///
518/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
519/// If either value in a comparison is NaN, returns false.
520///
521/// \headerfile <x86intrin.h>
522///
523/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
524///
525/// \param __a
526/// A 128-bit vector of [4 x float].
527/// \param __b
528/// A 128-bit vector of [4 x float].
529/// \returns A 128-bit vector of [4 x float] containing the comparison results.
530static __inline__ __m128 __DEFAULT_FN_ATTRS
531_mm_cmpeq_ps(__m128 __a, __m128 __b)
532{
533 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
534}
535
536/// Compares two 32-bit float values in the low-order bits of both
537/// operands to determine if the value in the first operand is less than the
538/// corresponding value in the second operand.
539///
540/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
541/// low-order bits of a vector of [4 x float].
542/// If either value in a comparison is NaN, returns false.
543///
544/// \headerfile <x86intrin.h>
545///
546/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
547///
548/// \param __a
549/// A 128-bit vector of [4 x float] containing one of the operands. The lower
550/// 32 bits of this operand are used in the comparison.
551/// \param __b
552/// A 128-bit vector of [4 x float] containing one of the operands. The lower
553/// 32 bits of this operand are used in the comparison.
554/// \returns A 128-bit vector of [4 x float] containing the comparison results
555/// in the low-order bits.
556static __inline__ __m128 __DEFAULT_FN_ATTRS
557_mm_cmplt_ss(__m128 __a, __m128 __b)
558{
559 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
560}
561
562/// Compares each of the corresponding 32-bit float values of the
563/// 128-bit vectors of [4 x float] to determine if the values in the first
564/// operand are less than those in the second operand.
565///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
567/// If either value in a comparison is NaN, returns false.
568///
569/// \headerfile <x86intrin.h>
570///
571/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
572///
573/// \param __a
574/// A 128-bit vector of [4 x float].
575/// \param __b
576/// A 128-bit vector of [4 x float].
577/// \returns A 128-bit vector of [4 x float] containing the comparison results.
578static __inline__ __m128 __DEFAULT_FN_ATTRS
579_mm_cmplt_ps(__m128 __a, __m128 __b)
580{
581 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
582}
583
584/// Compares two 32-bit float values in the low-order bits of both
585/// operands to determine if the value in the first operand is less than or
586/// equal to the corresponding value in the second operand.
587///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
590/// If either value in a comparison is NaN, returns false.
591///
592/// \headerfile <x86intrin.h>
593///
594/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
595///
596/// \param __a
597/// A 128-bit vector of [4 x float] containing one of the operands. The lower
598/// 32 bits of this operand are used in the comparison.
599/// \param __b
600/// A 128-bit vector of [4 x float] containing one of the operands. The lower
601/// 32 bits of this operand are used in the comparison.
602/// \returns A 128-bit vector of [4 x float] containing the comparison results
603/// in the low-order bits.
604static __inline__ __m128 __DEFAULT_FN_ATTRS
605_mm_cmple_ss(__m128 __a, __m128 __b)
606{
607 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
608}
609
610/// Compares each of the corresponding 32-bit float values of the
611/// 128-bit vectors of [4 x float] to determine if the values in the first
612/// operand are less than or equal to those in the second operand.
613///
614/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
615/// If either value in a comparison is NaN, returns false.
616///
617/// \headerfile <x86intrin.h>
618///
619/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
620///
621/// \param __a
622/// A 128-bit vector of [4 x float].
623/// \param __b
624/// A 128-bit vector of [4 x float].
625/// \returns A 128-bit vector of [4 x float] containing the comparison results.
626static __inline__ __m128 __DEFAULT_FN_ATTRS
627_mm_cmple_ps(__m128 __a, __m128 __b)
628{
629 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
630}
631
632/// Compares two 32-bit float values in the low-order bits of both
633/// operands to determine if the value in the first operand is greater than
634/// the corresponding value in the second operand.
635///
636/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
637/// low-order bits of a vector of [4 x float].
638/// If either value in a comparison is NaN, returns false.
639///
640/// \headerfile <x86intrin.h>
641///
642/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
643///
644/// \param __a
645/// A 128-bit vector of [4 x float] containing one of the operands. The lower
646/// 32 bits of this operand are used in the comparison.
647/// \param __b
648/// A 128-bit vector of [4 x float] containing one of the operands. The lower
649/// 32 bits of this operand are used in the comparison.
650/// \returns A 128-bit vector of [4 x float] containing the comparison results
651/// in the low-order bits.
652static __inline__ __m128 __DEFAULT_FN_ATTRS
653_mm_cmpgt_ss(__m128 __a, __m128 __b)
654{
655 return (__m128)__builtin_shufflevector((__v4sf)__a,
656 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
657 4, 1, 2, 3);
658}
659
660/// Compares each of the corresponding 32-bit float values of the
661/// 128-bit vectors of [4 x float] to determine if the values in the first
662/// operand are greater than those in the second operand.
663///
664/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
665/// If either value in a comparison is NaN, returns false.
666///
667/// \headerfile <x86intrin.h>
668///
669/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
670///
671/// \param __a
672/// A 128-bit vector of [4 x float].
673/// \param __b
674/// A 128-bit vector of [4 x float].
675/// \returns A 128-bit vector of [4 x float] containing the comparison results.
676static __inline__ __m128 __DEFAULT_FN_ATTRS
677_mm_cmpgt_ps(__m128 __a, __m128 __b)
678{
679 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
680}
681
682/// Compares two 32-bit float values in the low-order bits of both
683/// operands to determine if the value in the first operand is greater than
684/// or equal to the corresponding value in the second operand.
685///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
687/// low-order bits of a vector of [4 x float].
688/// If either value in a comparison is NaN, returns false.
689///
690/// \headerfile <x86intrin.h>
691///
692/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
693///
694/// \param __a
695/// A 128-bit vector of [4 x float] containing one of the operands. The lower
696/// 32 bits of this operand are used in the comparison.
697/// \param __b
698/// A 128-bit vector of [4 x float] containing one of the operands. The lower
699/// 32 bits of this operand are used in the comparison.
700/// \returns A 128-bit vector of [4 x float] containing the comparison results
701/// in the low-order bits.
702static __inline__ __m128 __DEFAULT_FN_ATTRS
703_mm_cmpge_ss(__m128 __a, __m128 __b)
704{
705 return (__m128)__builtin_shufflevector((__v4sf)__a,
706 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
707 4, 1, 2, 3);
708}
709
710/// Compares each of the corresponding 32-bit float values of the
711/// 128-bit vectors of [4 x float] to determine if the values in the first
712/// operand are greater than or equal to those in the second operand.
713///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
715/// If either value in a comparison is NaN, returns false.
716///
717/// \headerfile <x86intrin.h>
718///
719/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
720///
721/// \param __a
722/// A 128-bit vector of [4 x float].
723/// \param __b
724/// A 128-bit vector of [4 x float].
725/// \returns A 128-bit vector of [4 x float] containing the comparison results.
726static __inline__ __m128 __DEFAULT_FN_ATTRS
727_mm_cmpge_ps(__m128 __a, __m128 __b)
728{
729 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
730}
731
732/// Compares two 32-bit float values in the low-order bits of both operands
733/// for inequality.
734///
735/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
736/// low-order bits of a vector of [4 x float].
737/// If either value in a comparison is NaN, returns true.
738///
739/// \headerfile <x86intrin.h>
740///
741/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
742/// instructions.
743///
744/// \param __a
745/// A 128-bit vector of [4 x float] containing one of the operands. The lower
746/// 32 bits of this operand are used in the comparison.
747/// \param __b
748/// A 128-bit vector of [4 x float] containing one of the operands. The lower
749/// 32 bits of this operand are used in the comparison.
750/// \returns A 128-bit vector of [4 x float] containing the comparison results
751/// in the low-order bits.
752static __inline__ __m128 __DEFAULT_FN_ATTRS
753_mm_cmpneq_ss(__m128 __a, __m128 __b)
754{
755 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
756}
757
758/// Compares each of the corresponding 32-bit float values of the
759/// 128-bit vectors of [4 x float] for inequality.
760///
761/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
762/// If either value in a comparison is NaN, returns true.
763///
764/// \headerfile <x86intrin.h>
765///
766/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
767/// instructions.
768///
769/// \param __a
770/// A 128-bit vector of [4 x float].
771/// \param __b
772/// A 128-bit vector of [4 x float].
773/// \returns A 128-bit vector of [4 x float] containing the comparison results.
774static __inline__ __m128 __DEFAULT_FN_ATTRS
775_mm_cmpneq_ps(__m128 __a, __m128 __b)
776{
777 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
778}
779
780/// Compares two 32-bit float values in the low-order bits of both
781/// operands to determine if the value in the first operand is not less than
782/// the corresponding value in the second operand.
783///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
785/// low-order bits of a vector of [4 x float].
786/// If either value in a comparison is NaN, returns true.
787///
788/// \headerfile <x86intrin.h>
789///
790/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
791/// instructions.
792///
793/// \param __a
794/// A 128-bit vector of [4 x float] containing one of the operands. The lower
795/// 32 bits of this operand are used in the comparison.
796/// \param __b
797/// A 128-bit vector of [4 x float] containing one of the operands. The lower
798/// 32 bits of this operand are used in the comparison.
799/// \returns A 128-bit vector of [4 x float] containing the comparison results
800/// in the low-order bits.
801static __inline__ __m128 __DEFAULT_FN_ATTRS
802_mm_cmpnlt_ss(__m128 __a, __m128 __b)
803{
804 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
805}
806
807/// Compares each of the corresponding 32-bit float values of the
808/// 128-bit vectors of [4 x float] to determine if the values in the first
809/// operand are not less than those in the second operand.
810///
811/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
812/// If either value in a comparison is NaN, returns true.
813///
814/// \headerfile <x86intrin.h>
815///
816/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
817/// instructions.
818///
819/// \param __a
820/// A 128-bit vector of [4 x float].
821/// \param __b
822/// A 128-bit vector of [4 x float].
823/// \returns A 128-bit vector of [4 x float] containing the comparison results.
824static __inline__ __m128 __DEFAULT_FN_ATTRS
825_mm_cmpnlt_ps(__m128 __a, __m128 __b)
826{
827 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
828}
829
830/// Compares two 32-bit float values in the low-order bits of both
831/// operands to determine if the value in the first operand is not less than
832/// or equal to the corresponding value in the second operand.
833///
834/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
835/// low-order bits of a vector of [4 x float].
836/// If either value in a comparison is NaN, returns true.
837///
838/// \headerfile <x86intrin.h>
839///
840/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
841/// instructions.
842///
843/// \param __a
844/// A 128-bit vector of [4 x float] containing one of the operands. The lower
845/// 32 bits of this operand are used in the comparison.
846/// \param __b
847/// A 128-bit vector of [4 x float] containing one of the operands. The lower
848/// 32 bits of this operand are used in the comparison.
849/// \returns A 128-bit vector of [4 x float] containing the comparison results
850/// in the low-order bits.
851static __inline__ __m128 __DEFAULT_FN_ATTRS
852_mm_cmpnle_ss(__m128 __a, __m128 __b)
853{
854 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
855}
856
857/// Compares each of the corresponding 32-bit float values of the
858/// 128-bit vectors of [4 x float] to determine if the values in the first
859/// operand are not less than or equal to those in the second operand.
860///
861/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
862/// If either value in a comparison is NaN, returns true.
863///
864/// \headerfile <x86intrin.h>
865///
866/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
867/// instructions.
868///
869/// \param __a
870/// A 128-bit vector of [4 x float].
871/// \param __b
872/// A 128-bit vector of [4 x float].
873/// \returns A 128-bit vector of [4 x float] containing the comparison results.
874static __inline__ __m128 __DEFAULT_FN_ATTRS
875_mm_cmpnle_ps(__m128 __a, __m128 __b)
876{
877 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
878}
879
880/// Compares two 32-bit float values in the low-order bits of both
881/// operands to determine if the value in the first operand is not greater
882/// than the corresponding value in the second operand.
883///
884/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
885/// low-order bits of a vector of [4 x float].
886/// If either value in a comparison is NaN, returns true.
887///
888/// \headerfile <x86intrin.h>
889///
890/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
891/// instructions.
892///
893/// \param __a
894/// A 128-bit vector of [4 x float] containing one of the operands. The lower
895/// 32 bits of this operand are used in the comparison.
896/// \param __b
897/// A 128-bit vector of [4 x float] containing one of the operands. The lower
898/// 32 bits of this operand are used in the comparison.
899/// \returns A 128-bit vector of [4 x float] containing the comparison results
900/// in the low-order bits.
901static __inline__ __m128 __DEFAULT_FN_ATTRS
902_mm_cmpngt_ss(__m128 __a, __m128 __b)
903{
904 return (__m128)__builtin_shufflevector((__v4sf)__a,
905 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
906 4, 1, 2, 3);
907}
908
909/// Compares each of the corresponding 32-bit float values of the
910/// 128-bit vectors of [4 x float] to determine if the values in the first
911/// operand are not greater than those in the second operand.
912///
913/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
914/// If either value in a comparison is NaN, returns true.
915///
916/// \headerfile <x86intrin.h>
917///
918/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
919/// instructions.
920///
921/// \param __a
922/// A 128-bit vector of [4 x float].
923/// \param __b
924/// A 128-bit vector of [4 x float].
925/// \returns A 128-bit vector of [4 x float] containing the comparison results.
926static __inline__ __m128 __DEFAULT_FN_ATTRS
927_mm_cmpngt_ps(__m128 __a, __m128 __b)
928{
929 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
930}
931
932/// Compares two 32-bit float values in the low-order bits of both
933/// operands to determine if the value in the first operand is not greater
934/// than or equal to the corresponding value in the second operand.
935///
936/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
937/// low-order bits of a vector of [4 x float].
938/// If either value in a comparison is NaN, returns true.
939///
940/// \headerfile <x86intrin.h>
941///
942/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
943/// instructions.
944///
945/// \param __a
946/// A 128-bit vector of [4 x float] containing one of the operands. The lower
947/// 32 bits of this operand are used in the comparison.
948/// \param __b
949/// A 128-bit vector of [4 x float] containing one of the operands. The lower
950/// 32 bits of this operand are used in the comparison.
951/// \returns A 128-bit vector of [4 x float] containing the comparison results
952/// in the low-order bits.
953static __inline__ __m128 __DEFAULT_FN_ATTRS
954_mm_cmpnge_ss(__m128 __a, __m128 __b)
955{
956 return (__m128)__builtin_shufflevector((__v4sf)__a,
957 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
958 4, 1, 2, 3);
959}
960
961/// Compares each of the corresponding 32-bit float values of the
962/// 128-bit vectors of [4 x float] to determine if the values in the first
963/// operand are not greater than or equal to those in the second operand.
964///
965/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
966/// If either value in a comparison is NaN, returns true.
967///
968/// \headerfile <x86intrin.h>
969///
970/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
971/// instructions.
972///
973/// \param __a
974/// A 128-bit vector of [4 x float].
975/// \param __b
976/// A 128-bit vector of [4 x float].
977/// \returns A 128-bit vector of [4 x float] containing the comparison results.
978static __inline__ __m128 __DEFAULT_FN_ATTRS
979_mm_cmpnge_ps(__m128 __a, __m128 __b)
980{
981 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
982}
983
984/// Compares two 32-bit float values in the low-order bits of both
985/// operands to determine if the value in the first operand is ordered with
986/// respect to the corresponding value in the second operand.
987///
988/// A pair of floating-point values are ordered with respect to each
989/// other if neither value is a NaN. Each comparison returns 0x0 for false,
990/// 0xFFFFFFFF for true.
991///
992/// \headerfile <x86intrin.h>
993///
994/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
995/// instructions.
996///
997/// \param __a
998/// A 128-bit vector of [4 x float] containing one of the operands. The lower
999/// 32 bits of this operand are used in the comparison.
1000/// \param __b
1001/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1002/// 32 bits of this operand are used in the comparison.
1003/// \returns A 128-bit vector of [4 x float] containing the comparison results
1004/// in the low-order bits.
1005static __inline__ __m128 __DEFAULT_FN_ATTRS
1006_mm_cmpord_ss(__m128 __a, __m128 __b)
1007{
1008 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1009}
1010
1011/// Compares each of the corresponding 32-bit float values of the
1012/// 128-bit vectors of [4 x float] to determine if the values in the first
1013/// operand are ordered with respect to those in the second operand.
1014///
1015/// A pair of floating-point values are ordered with respect to each
1016/// other if neither value is a NaN. Each comparison returns 0x0 for false,
1017/// 0xFFFFFFFF for true.
1018///
1019/// \headerfile <x86intrin.h>
1020///
1021/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1022/// instructions.
1023///
1024/// \param __a
1025/// A 128-bit vector of [4 x float].
1026/// \param __b
1027/// A 128-bit vector of [4 x float].
1028/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1029static __inline__ __m128 __DEFAULT_FN_ATTRS
1030_mm_cmpord_ps(__m128 __a, __m128 __b)
1031{
1032 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1033}
1034
1035/// Compares two 32-bit float values in the low-order bits of both
1036/// operands to determine if the value in the first operand is unordered
1037/// with respect to the corresponding value in the second operand.
1038///
1039/// A pair of double-precision values are unordered with respect to each
1040/// other if one or both values are NaN. Each comparison returns 0x0 for
1041/// false, 0xFFFFFFFF for true.
1042///
1043/// \headerfile <x86intrin.h>
1044///
1045/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1046/// instructions.
1047///
1048/// \param __a
1049/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1050/// 32 bits of this operand are used in the comparison.
1051/// \param __b
1052/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1053/// 32 bits of this operand are used in the comparison.
1054/// \returns A 128-bit vector of [4 x float] containing the comparison results
1055/// in the low-order bits.
1056static __inline__ __m128 __DEFAULT_FN_ATTRS
1057_mm_cmpunord_ss(__m128 __a, __m128 __b)
1058{
1059 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1060}
1061
1062/// Compares each of the corresponding 32-bit float values of the
1063/// 128-bit vectors of [4 x float] to determine if the values in the first
1064/// operand are unordered with respect to those in the second operand.
1065///
1066/// A pair of double-precision values are unordered with respect to each
1067/// other if one or both values are NaN. Each comparison returns 0x0 for
1068/// false, 0xFFFFFFFFFFFFFFFF for true.
1069///
1070/// \headerfile <x86intrin.h>
1071///
1072/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1073/// instructions.
1074///
1075/// \param __a
1076/// A 128-bit vector of [4 x float].
1077/// \param __b
1078/// A 128-bit vector of [4 x float].
1079/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1080static __inline__ __m128 __DEFAULT_FN_ATTRS
1081_mm_cmpunord_ps(__m128 __a, __m128 __b)
1082{
1083 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1084}
1085
1086/// Compares two 32-bit float values in the low-order bits of both
1087/// operands for equality.
1088///
1089/// The comparison returns 0 for false, 1 for true. If either value in a
1090/// comparison is NaN, returns 0.
1091///
1092/// \headerfile <x86intrin.h>
1093///
1094/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1095/// instructions.
1096///
1097/// \param __a
1098/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1099/// used in the comparison.
1100/// \param __b
1101/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1102/// used in the comparison.
1103/// \returns An integer containing the comparison results.
1104static __inline__ int __DEFAULT_FN_ATTRS
1105_mm_comieq_ss(__m128 __a, __m128 __b)
1106{
1107 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1108}
1109
1110/// Compares two 32-bit float values in the low-order bits of both
1111/// operands to determine if the first operand is less than the second
1112/// operand.
1113///
1114/// The comparison returns 0 for false, 1 for true. If either value in a
1115/// comparison is NaN, returns 0.
1116///
1117/// \headerfile <x86intrin.h>
1118///
1119/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1120/// instructions.
1121///
1122/// \param __a
1123/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1124/// used in the comparison.
1125/// \param __b
1126/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1127/// used in the comparison.
1128/// \returns An integer containing the comparison results.
1129static __inline__ int __DEFAULT_FN_ATTRS
1130_mm_comilt_ss(__m128 __a, __m128 __b)
1131{
1132 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1133}
1134
1135/// Compares two 32-bit float values in the low-order bits of both
1136/// operands to determine if the first operand is less than or equal to the
1137/// second operand.
1138///
1139/// The comparison returns 0 for false, 1 for true. If either value in a
1140/// comparison is NaN, returns 0.
1141///
1142/// \headerfile <x86intrin.h>
1143///
1144/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1145///
1146/// \param __a
1147/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1148/// used in the comparison.
1149/// \param __b
1150/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1151/// used in the comparison.
1152/// \returns An integer containing the comparison results.
1153static __inline__ int __DEFAULT_FN_ATTRS
1154_mm_comile_ss(__m128 __a, __m128 __b)
1155{
1156 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1157}
1158
1159/// Compares two 32-bit float values in the low-order bits of both
1160/// operands to determine if the first operand is greater than the second
1161/// operand.
1162///
1163/// The comparison returns 0 for false, 1 for true. If either value in a
1164/// comparison is NaN, returns 0.
1165///
1166/// \headerfile <x86intrin.h>
1167///
1168/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1169///
1170/// \param __a
1171/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1172/// used in the comparison.
1173/// \param __b
1174/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1175/// used in the comparison.
1176/// \returns An integer containing the comparison results.
1177static __inline__ int __DEFAULT_FN_ATTRS
1178_mm_comigt_ss(__m128 __a, __m128 __b)
1179{
1180 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1181}
1182
1183/// Compares two 32-bit float values in the low-order bits of both
1184/// operands to determine if the first operand is greater than or equal to
1185/// the second operand.
1186///
1187/// The comparison returns 0 for false, 1 for true. If either value in a
1188/// comparison is NaN, returns 0.
1189///
1190/// \headerfile <x86intrin.h>
1191///
1192/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1193///
1194/// \param __a
1195/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1196/// used in the comparison.
1197/// \param __b
1198/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1199/// used in the comparison.
1200/// \returns An integer containing the comparison results.
1201static __inline__ int __DEFAULT_FN_ATTRS
1202_mm_comige_ss(__m128 __a, __m128 __b)
1203{
1204 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1205}
1206
1207/// Compares two 32-bit float values in the low-order bits of both
1208/// operands to determine if the first operand is not equal to the second
1209/// operand.
1210///
1211/// The comparison returns 0 for false, 1 for true. If either value in a
1212/// comparison is NaN, returns 1.
1213///
1214/// \headerfile <x86intrin.h>
1215///
1216/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1217///
1218/// \param __a
1219/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1220/// used in the comparison.
1221/// \param __b
1222/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1223/// used in the comparison.
1224/// \returns An integer containing the comparison results.
1225static __inline__ int __DEFAULT_FN_ATTRS
1226_mm_comineq_ss(__m128 __a, __m128 __b)
1227{
1228 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1229}
1230
1231/// Performs an unordered comparison of two 32-bit float values using
1232/// the low-order bits of both operands to determine equality.
1233///
1234/// The comparison returns 0 for false, 1 for true. If either value in a
1235/// comparison is NaN, returns 0.
1236///
1237/// \headerfile <x86intrin.h>
1238///
1239/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1240///
1241/// \param __a
1242/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1243/// used in the comparison.
1244/// \param __b
1245/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1246/// used in the comparison.
1247/// \returns An integer containing the comparison results.
1248static __inline__ int __DEFAULT_FN_ATTRS
1249_mm_ucomieq_ss(__m128 __a, __m128 __b)
1250{
1251 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1252}
1253
1254/// Performs an unordered comparison of two 32-bit float values using
1255/// the low-order bits of both operands to determine if the first operand is
1256/// less than the second operand.
1257///
1258/// The comparison returns 0 for false, 1 for true. If either value in a
1259/// comparison is NaN, returns 0.
1260///
1261/// \headerfile <x86intrin.h>
1262///
1263/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1264///
1265/// \param __a
1266/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1267/// used in the comparison.
1268/// \param __b
1269/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1270/// used in the comparison.
1271/// \returns An integer containing the comparison results.
1272static __inline__ int __DEFAULT_FN_ATTRS
1273_mm_ucomilt_ss(__m128 __a, __m128 __b)
1274{
1275 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1276}
1277
1278/// Performs an unordered comparison of two 32-bit float values using
1279/// the low-order bits of both operands to determine if the first operand is
1280/// less than or equal to the second operand.
1281///
1282/// The comparison returns 0 for false, 1 for true. If either value in a
1283/// comparison is NaN, returns 0.
1284///
1285/// \headerfile <x86intrin.h>
1286///
1287/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1288///
1289/// \param __a
1290/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1291/// used in the comparison.
1292/// \param __b
1293/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1294/// used in the comparison.
1295/// \returns An integer containing the comparison results.
1296static __inline__ int __DEFAULT_FN_ATTRS
1297_mm_ucomile_ss(__m128 __a, __m128 __b)
1298{
1299 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1300}
1301
1302/// Performs an unordered comparison of two 32-bit float values using
1303/// the low-order bits of both operands to determine if the first operand is
1304/// greater than the second operand.
1305///
1306/// The comparison returns 0 for false, 1 for true. If either value in a
1307/// comparison is NaN, returns 0.
1308///
1309/// \headerfile <x86intrin.h>
1310///
1311/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1312///
1313/// \param __a
1314/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1315/// used in the comparison.
1316/// \param __b
1317/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1318/// used in the comparison.
1319/// \returns An integer containing the comparison results.
1320static __inline__ int __DEFAULT_FN_ATTRS
1321_mm_ucomigt_ss(__m128 __a, __m128 __b)
1322{
1323 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1324}
1325
1326/// Performs an unordered comparison of two 32-bit float values using
1327/// the low-order bits of both operands to determine if the first operand is
1328/// greater than or equal to the second operand.
1329///
1330/// The comparison returns 0 for false, 1 for true. If either value in a
1331/// comparison is NaN, returns 0.
1332///
1333/// \headerfile <x86intrin.h>
1334///
1335/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1336///
1337/// \param __a
1338/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1339/// used in the comparison.
1340/// \param __b
1341/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1342/// used in the comparison.
1343/// \returns An integer containing the comparison results.
1344static __inline__ int __DEFAULT_FN_ATTRS
1345_mm_ucomige_ss(__m128 __a, __m128 __b)
1346{
1347 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1348}
1349
1350/// Performs an unordered comparison of two 32-bit float values using
1351/// the low-order bits of both operands to determine inequality.
1352///
1353/// The comparison returns 0 for false, 1 for true. If either value in a
1354/// comparison is NaN, returns 0.
1355///
1356/// \headerfile <x86intrin.h>
1357///
1358/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1359///
1360/// \param __a
1361/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1362/// used in the comparison.
1363/// \param __b
1364/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1365/// used in the comparison.
1366/// \returns An integer containing the comparison results.
1367static __inline__ int __DEFAULT_FN_ATTRS
1368_mm_ucomineq_ss(__m128 __a, __m128 __b)
1369{
1370 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1371}
1372
1373/// Converts a float value contained in the lower 32 bits of a vector of
1374/// [4 x float] into a 32-bit integer.
1375///
1376/// If the converted value does not fit in a 32-bit integer, raises a
1377/// floating-point invalid exception. If the exception is masked, returns
1378/// the most negative integer.
1379///
1380/// \headerfile <x86intrin.h>
1381///
1382/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1383/// instructions.
1384///
1385/// \param __a
1386/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1387/// used in the conversion.
1388/// \returns A 32-bit integer containing the converted value.
1389static __inline__ int __DEFAULT_FN_ATTRS
1391{
1392 return __builtin_ia32_cvtss2si((__v4sf)__a);
1393}
1394
1395/// Converts a float value contained in the lower 32 bits of a vector of
1396/// [4 x float] into a 32-bit integer.
1397///
1398/// If the converted value does not fit in a 32-bit integer, raises a
1399/// floating-point invalid exception. If the exception is masked, returns
1400/// the most negative integer.
1401///
1402/// \headerfile <x86intrin.h>
1403///
1404/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1405/// instructions.
1406///
1407/// \param __a
1408/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1409/// used in the conversion.
1410/// \returns A 32-bit integer containing the converted value.
1411static __inline__ int __DEFAULT_FN_ATTRS
1413{
1414 return _mm_cvtss_si32(__a);
1415}
1416
1417#ifdef __x86_64__
1418
1419/// Converts a float value contained in the lower 32 bits of a vector of
1420/// [4 x float] into a 64-bit integer.
1421///
1422/// If the converted value does not fit in a 32-bit integer, raises a
1423/// floating-point invalid exception. If the exception is masked, returns
1424/// the most negative integer.
1425///
1426/// \headerfile <x86intrin.h>
1427///
1428/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1429/// instructions.
1430///
1431/// \param __a
1432/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1433/// used in the conversion.
1434/// \returns A 64-bit integer containing the converted value.
1435static __inline__ long long __DEFAULT_FN_ATTRS
1436_mm_cvtss_si64(__m128 __a)
1437{
1438 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1439}
1440
1441#endif
1442
1443/// Converts two low-order float values in a 128-bit vector of
1444/// [4 x float] into a 64-bit vector of [2 x i32].
1445///
1446/// If a converted value does not fit in a 32-bit integer, raises a
1447/// floating-point invalid exception. If the exception is masked, returns
1448/// the most negative integer.
1449///
1450/// \headerfile <x86intrin.h>
1451///
1452/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1453///
1454/// \param __a
1455/// A 128-bit vector of [4 x float].
1456/// \returns A 64-bit integer vector containing the converted values.
1457static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1459{
1460 return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
1461}
1462
1463/// Converts two low-order float values in a 128-bit vector of
1464/// [4 x float] into a 64-bit vector of [2 x i32].
1465///
1466/// If a converted value does not fit in a 32-bit integer, raises a
1467/// floating-point invalid exception. If the exception is masked, returns
1468/// the most negative integer.
1469///
1470/// \headerfile <x86intrin.h>
1471///
1472/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1473///
1474/// \param __a
1475/// A 128-bit vector of [4 x float].
1476/// \returns A 64-bit integer vector containing the converted values.
1477static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1479{
1480 return _mm_cvtps_pi32(__a);
1481}
1482
1483/// Converts the lower (first) element of a vector of [4 x float] into a signed
1484/// truncated (rounded toward zero) 32-bit integer.
1485///
1486/// If the converted value does not fit in a 32-bit integer, raises a
1487/// floating-point invalid exception. If the exception is masked, returns
1488/// the most negative integer.
1489///
1490/// \headerfile <x86intrin.h>
1491///
1492/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1493/// instructions.
1494///
1495/// \param __a
1496/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1497/// used in the conversion.
1498/// \returns A 32-bit integer containing the converted value.
1499static __inline__ int __DEFAULT_FN_ATTRS
1501{
1502 return __builtin_ia32_cvttss2si((__v4sf)__a);
1503}
1504
1505/// Converts the lower (first) element of a vector of [4 x float] into a signed
1506/// truncated (rounded toward zero) 32-bit integer.
1507///
1508/// If the converted value does not fit in a 32-bit integer, raises a
1509/// floating-point invalid exception. If the exception is masked, returns
1510/// the most negative integer.
1511///
1512/// \headerfile <x86intrin.h>
1513///
1514/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1515/// instructions.
1516///
1517/// \param __a
1518/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1519/// used in the conversion.
1520/// \returns A 32-bit integer containing the converted value.
1521static __inline__ int __DEFAULT_FN_ATTRS
1523{
1524 return _mm_cvttss_si32(__a);
1525}
1526
1527#ifdef __x86_64__
1528/// Converts the lower (first) element of a vector of [4 x float] into a signed
1529/// truncated (rounded toward zero) 64-bit integer.
1530///
1531/// If the converted value does not fit in a 64-bit integer, raises a
1532/// floating-point invalid exception. If the exception is masked, returns
1533/// the most negative integer.
1534///
1535/// \headerfile <x86intrin.h>
1536///
1537/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1538/// instructions.
1539///
1540/// \param __a
1541/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1542/// used in the conversion.
1543/// \returns A 64-bit integer containing the converted value.
1544static __inline__ long long __DEFAULT_FN_ATTRS
1545_mm_cvttss_si64(__m128 __a)
1546{
1547 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1548}
1549#endif
1550
1551/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1552/// into two signed truncated (rounded toward zero) 32-bit integers,
1553/// returned in a 64-bit vector of [2 x i32].
1554///
1555/// If a converted value does not fit in a 32-bit integer, raises a
1556/// floating-point invalid exception. If the exception is masked, returns
1557/// the most negative integer.
1558///
1559/// \headerfile <x86intrin.h>
1560///
1561/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1562/// instructions.
1563///
1564/// \param __a
1565/// A 128-bit vector of [4 x float].
1566/// \returns A 64-bit integer vector containing the converted values.
1567static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1569{
1570 return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
1571}
1572
1573/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1574/// into two signed truncated (rounded toward zero) 64-bit integers,
1575/// returned in a 64-bit vector of [2 x i32].
1576///
1577/// If a converted value does not fit in a 32-bit integer, raises a
1578/// floating-point invalid exception. If the exception is masked, returns
1579/// the most negative integer.
1580///
1581/// \headerfile <x86intrin.h>
1582///
1583/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1584///
1585/// \param __a
1586/// A 128-bit vector of [4 x float].
1587/// \returns A 64-bit integer vector containing the converted values.
1588static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1590{
1591 return _mm_cvttps_pi32(__a);
1592}
1593
1594/// Converts a 32-bit signed integer value into a floating point value
1595/// and writes it to the lower 32 bits of the destination. The remaining
1596/// higher order elements of the destination vector are copied from the
1597/// corresponding elements in the first operand.
1598///
1599/// \headerfile <x86intrin.h>
1600///
1601/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1602///
1603/// \param __a
1604/// A 128-bit vector of [4 x float].
1605/// \param __b
1606/// A 32-bit signed integer operand containing the value to be converted.
1607/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1608/// converted value of the second operand. The upper 96 bits are copied from
1609/// the upper 96 bits of the first operand.
1610static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
1611 int __b) {
1612 __a[0] = __b;
1613 return __a;
1614}
1615
1616/// Converts a 32-bit signed integer value into a floating point value
1617/// and writes it to the lower 32 bits of the destination. The remaining
1618/// higher order elements of the destination are copied from the
1619/// corresponding elements in the first operand.
1620///
1621/// \headerfile <x86intrin.h>
1622///
1623/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1624///
1625/// \param __a
1626/// A 128-bit vector of [4 x float].
1627/// \param __b
1628/// A 32-bit signed integer operand containing the value to be converted.
1629/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1630/// converted value of the second operand. The upper 96 bits are copied from
1631/// the upper 96 bits of the first operand.
1632static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
1633 int __b) {
1634 return _mm_cvtsi32_ss(__a, __b);
1635}
1636
1637#ifdef __x86_64__
1638
1639/// Converts a 64-bit signed integer value into a floating point value
1640/// and writes it to the lower 32 bits of the destination. The remaining
1641/// higher order elements of the destination are copied from the
1642/// corresponding elements in the first operand.
1643///
1644/// \headerfile <x86intrin.h>
1645///
1646/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1647///
1648/// \param __a
1649/// A 128-bit vector of [4 x float].
1650/// \param __b
1651/// A 64-bit signed integer operand containing the value to be converted.
1652/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1653/// converted value of the second operand. The upper 96 bits are copied from
1654/// the upper 96 bits of the first operand.
1655static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1656_mm_cvtsi64_ss(__m128 __a, long long __b) {
1657 __a[0] = __b;
1658 return __a;
1659}
1660
1661#endif
1662
1663/// Converts two elements of a 64-bit vector of [2 x i32] into two
1664/// floating point values and writes them to the lower 64-bits of the
1665/// destination. The remaining higher order elements of the destination are
1666/// copied from the corresponding elements in the first operand.
1667///
1668/// \headerfile <x86intrin.h>
1669///
1670/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1671///
1672/// \param __a
1673/// A 128-bit vector of [4 x float].
1674/// \param __b
1675/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1676/// and written to the corresponding low-order elements in the destination.
1677/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1678/// converted value of the second operand. The upper 64 bits are copied from
1679/// the upper 64 bits of the first operand.
1680static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1681_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1682{
1683 return (__m128)__builtin_shufflevector(
1684 (__v4sf)__a,
1685 __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
1686 4, 5, 2, 3);
1687}
1688
1689/// Converts two elements of a 64-bit vector of [2 x i32] into two
1690/// floating point values and writes them to the lower 64-bits of the
1691/// destination. The remaining higher order elements of the destination are
1692/// copied from the corresponding elements in the first operand.
1693///
1694/// \headerfile <x86intrin.h>
1695///
1696/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1697///
1698/// \param __a
1699/// A 128-bit vector of [4 x float].
1700/// \param __b
1701/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1702/// and written to the corresponding low-order elements in the destination.
1703/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1704/// converted value from the second operand. The upper 64 bits are copied
1705/// from the upper 64 bits of the first operand.
1706static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1707_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1708{
1709 return _mm_cvtpi32_ps(__a, __b);
1710}
1711
1712/// Extracts a float value contained in the lower 32 bits of a vector of
1713/// [4 x float].
1714///
1715/// \headerfile <x86intrin.h>
1716///
1717/// This intrinsic has no corresponding instruction.
1718///
1719/// \param __a
1720/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1721/// used in the extraction.
1722/// \returns A 32-bit float containing the extracted value.
1723static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
1725 return __a[0];
1726}
1727
1728/// Loads two packed float values from the address \a __p into the
1729/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1730/// are copied from the low-order bits of the first operand.
1731///
1732/// \headerfile <x86intrin.h>
1733///
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
1735///
1736/// \param __a
1737/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1738/// of the destination.
1739/// \param __p
1740/// A pointer to two packed float values. Bits [63:0] are written to bits
1741/// [127:64] of the destination.
1742/// \returns A 128-bit vector of [4 x float] containing the moved values.
1743static __inline__ __m128 __DEFAULT_FN_ATTRS
1744_mm_loadh_pi(__m128 __a, const __m64 *__p)
1745{
1746 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1747 struct __mm_loadh_pi_struct {
1748 __mm_loadh_pi_v2f32 __u;
1749 } __attribute__((__packed__, __may_alias__));
1750 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1751 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1752 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1753}
1754
1755/// Loads two packed float values from the address \a __p into the
1756/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1757/// are copied from the high-order bits of the first operand.
1758///
1759/// \headerfile <x86intrin.h>
1760///
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1762///
1763/// \param __a
1764/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1765/// [127:64] of the destination.
1766/// \param __p
1767/// A pointer to two packed float values. Bits [63:0] are written to bits
1768/// [63:0] of the destination.
1769/// \returns A 128-bit vector of [4 x float] containing the moved values.
1770static __inline__ __m128 __DEFAULT_FN_ATTRS
1771_mm_loadl_pi(__m128 __a, const __m64 *__p)
1772{
1773 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1774 struct __mm_loadl_pi_struct {
1775 __mm_loadl_pi_v2f32 __u;
1776 } __attribute__((__packed__, __may_alias__));
1777 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1778 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1779 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1780}
1781
1782/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1783/// 32 bits of the vector are initialized with the single-precision
1784/// floating-point value loaded from a specified memory location. The upper
1785/// 96 bits are set to zero.
1786///
1787/// \headerfile <x86intrin.h>
1788///
1789/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1790///
1791/// \param __p
1792/// A pointer to a 32-bit memory location containing a single-precision
1793/// floating-point value.
1794/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1795/// lower 32 bits contain the value loaded from the memory location. The
1796/// upper 96 bits are set to zero.
1797static __inline__ __m128 __DEFAULT_FN_ATTRS
1798_mm_load_ss(const float *__p)
1799{
1800 struct __mm_load_ss_struct {
1801 float __u;
1802 } __attribute__((__packed__, __may_alias__));
1803 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1804 return __extension__ (__m128){ __u, 0, 0, 0 };
1805}
1806
1807/// Loads a 32-bit float value and duplicates it to all four vector
1808/// elements of a 128-bit vector of [4 x float].
1809///
1810/// \headerfile <x86intrin.h>
1811///
1812/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1813/// instruction.
1814///
1815/// \param __p
1816/// A pointer to a float value to be loaded and duplicated.
1817/// \returns A 128-bit vector of [4 x float] containing the loaded and
1818/// duplicated values.
1819static __inline__ __m128 __DEFAULT_FN_ATTRS
1820_mm_load1_ps(const float *__p)
1821{
1822 struct __mm_load1_ps_struct {
1823 float __u;
1824 } __attribute__((__packed__, __may_alias__));
1825 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1826 return __extension__ (__m128){ __u, __u, __u, __u };
1827}
1828
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1830
1831/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1832/// memory location.
1833///
1834/// \headerfile <x86intrin.h>
1835///
1836/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1837///
1838/// \param __p
1839/// A pointer to a 128-bit memory location. The address of the memory
1840/// location has to be 128-bit aligned.
1841/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1842static __inline__ __m128 __DEFAULT_FN_ATTRS
1843_mm_load_ps(const float *__p)
1844{
1845 return *(const __m128*)__p;
1846}
1847
1848/// Loads a 128-bit floating-point vector of [4 x float] from an
1849/// unaligned memory location.
1850///
1851/// \headerfile <x86intrin.h>
1852///
1853/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1854///
1855/// \param __p
1856/// A pointer to a 128-bit memory location. The address of the memory
1857/// location does not have to be aligned.
1858/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1859static __inline__ __m128 __DEFAULT_FN_ATTRS
1860_mm_loadu_ps(const float *__p)
1861{
1862 struct __loadu_ps {
1863 __m128_u __v;
1864 } __attribute__((__packed__, __may_alias__));
1865 return ((const struct __loadu_ps*)__p)->__v;
1866}
1867
1868/// Loads four packed float values, in reverse order, from an aligned
1869/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1870///
1871/// \headerfile <x86intrin.h>
1872///
1873/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1874/// instruction.
1875///
1876/// \param __p
1877/// A pointer to a 128-bit memory location. The address of the memory
1878/// location has to be 128-bit aligned.
1879/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1880/// in reverse order.
1881static __inline__ __m128 __DEFAULT_FN_ATTRS
1882_mm_loadr_ps(const float *__p)
1883{
1884 __m128 __a = _mm_load_ps(__p);
1885 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1886}
1887
1888/// Create a 128-bit vector of [4 x float] with undefined values.
1889///
1890/// \headerfile <x86intrin.h>
1891///
1892/// This intrinsic has no corresponding instruction.
1893///
1894/// \returns A 128-bit vector of [4 x float] containing undefined values.
1895static __inline__ __m128 __DEFAULT_FN_ATTRS
1897{
1898 return (__m128)__builtin_ia32_undef128();
1899}
1900
1901/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1902/// 32 bits of the vector are initialized with the specified single-precision
1903/// floating-point value. The upper 96 bits are set to zero.
1904///
1905/// \headerfile <x86intrin.h>
1906///
1907/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1908///
1909/// \param __w
1910/// A single-precision floating-point value used to initialize the lower 32
1911/// bits of the result.
1912/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1913/// lower 32 bits contain the value provided in the source operand. The
1914/// upper 96 bits are set to zero.
1915static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1916_mm_set_ss(float __w) {
1917 return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
1918}
1919
1920/// Constructs a 128-bit floating-point vector of [4 x float], with each
1921/// of the four single-precision floating-point vector elements set to the
1922/// specified single-precision floating-point value.
1923///
1924/// \headerfile <x86intrin.h>
1925///
1926/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1927///
1928/// \param __w
1929/// A single-precision floating-point value used to initialize each vector
1930/// element of the result.
1931/// \returns An initialized 128-bit floating-point vector of [4 x float].
1932static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1933_mm_set1_ps(float __w) {
1934 return __extension__ (__m128){ __w, __w, __w, __w };
1935}
1936
1937/* Microsoft specific. */
1938/// Constructs a 128-bit floating-point vector of [4 x float], with each
1939/// of the four single-precision floating-point vector elements set to the
1940/// specified single-precision floating-point value.
1941///
1942/// \headerfile <x86intrin.h>
1943///
1944/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1945///
1946/// \param __w
1947/// A single-precision floating-point value used to initialize each vector
1948/// element of the result.
1949/// \returns An initialized 128-bit floating-point vector of [4 x float].
1950static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1951_mm_set_ps1(float __w) {
1952 return _mm_set1_ps(__w);
1953}
1954
1955/// Constructs a 128-bit floating-point vector of [4 x float]
1956/// initialized with the specified single-precision floating-point values.
1957///
1958/// \headerfile <x86intrin.h>
1959///
1960/// This intrinsic is a utility function and does not correspond to a specific
1961/// instruction.
1962///
1963/// \param __z
1964/// A single-precision floating-point value used to initialize bits [127:96]
1965/// of the result.
1966/// \param __y
1967/// A single-precision floating-point value used to initialize bits [95:64]
1968/// of the result.
1969/// \param __x
1970/// A single-precision floating-point value used to initialize bits [63:32]
1971/// of the result.
1972/// \param __w
1973/// A single-precision floating-point value used to initialize bits [31:0]
1974/// of the result.
1975/// \returns An initialized 128-bit floating-point vector of [4 x float].
1976static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1977_mm_set_ps(float __z, float __y, float __x, float __w) {
1978 return __extension__ (__m128){ __w, __x, __y, __z };
1979}
1980
1981/// Constructs a 128-bit floating-point vector of [4 x float],
1982/// initialized in reverse order with the specified 32-bit single-precision
/// floating-point values.
1984///
1985/// \headerfile <x86intrin.h>
1986///
1987/// This intrinsic is a utility function and does not correspond to a specific
1988/// instruction.
1989///
1990/// \param __z
1991/// A single-precision floating-point value used to initialize bits [31:0]
1992/// of the result.
1993/// \param __y
1994/// A single-precision floating-point value used to initialize bits [63:32]
1995/// of the result.
1996/// \param __x
1997/// A single-precision floating-point value used to initialize bits [95:64]
1998/// of the result.
1999/// \param __w
2000/// A single-precision floating-point value used to initialize bits [127:96]
2001/// of the result.
2002/// \returns An initialized 128-bit floating-point vector of [4 x float].
2003static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2004_mm_setr_ps(float __z, float __y, float __x, float __w) {
2005 return __extension__ (__m128){ __z, __y, __x, __w };
2006}
2007
2008/// Constructs a 128-bit floating-point vector of [4 x float] initialized
2009/// to zero.
2010///
2011/// \headerfile <x86intrin.h>
2012///
2013/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2014///
2015/// \returns An initialized 128-bit floating-point vector of [4 x float] with
2016/// all elements set to zero.
2017static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2019 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2020}
2021
2022/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2023/// memory location.
2024///
2025/// \headerfile <x86intrin.h>
2026///
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
2028///
2029/// \param __p
2030/// A pointer to a 64-bit memory location.
2031/// \param __a
2032/// A 128-bit vector of [4 x float] containing the values to be stored.
2033static __inline__ void __DEFAULT_FN_ATTRS
2034_mm_storeh_pi(__m64 *__p, __m128 __a)
2035{
2036 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2037 struct __mm_storeh_pi_struct {
2038 __mm_storeh_pi_v2f32 __u;
2039 } __attribute__((__packed__, __may_alias__));
2040 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2041}
2042
2043/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2044/// memory location.
2045///
2046/// \headerfile <x86intrin.h>
2047///
2048/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2049///
2050/// \param __p
2051/// A pointer to a memory location that will receive the float values.
2052/// \param __a
2053/// A 128-bit vector of [4 x float] containing the values to be stored.
2054static __inline__ void __DEFAULT_FN_ATTRS
2055_mm_storel_pi(__m64 *__p, __m128 __a)
2056{
2057 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2058 struct __mm_storeh_pi_struct {
2059 __mm_storeh_pi_v2f32 __u;
2060 } __attribute__((__packed__, __may_alias__));
2061 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2062}
2063
2064/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2065/// memory location.
2066///
2067/// \headerfile <x86intrin.h>
2068///
2069/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2070///
2071/// \param __p
2072/// A pointer to a 32-bit memory location.
2073/// \param __a
2074/// A 128-bit vector of [4 x float] containing the value to be stored.
2075static __inline__ void __DEFAULT_FN_ATTRS
2076_mm_store_ss(float *__p, __m128 __a)
2077{
2078 struct __mm_store_ss_struct {
2079 float __u;
2080 } __attribute__((__packed__, __may_alias__));
2081 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2082}
2083
2084/// Stores a 128-bit vector of [4 x float] to an unaligned memory
2085/// location.
2086///
2087/// \headerfile <x86intrin.h>
2088///
2089/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2090///
2091/// \param __p
2092/// A pointer to a 128-bit memory location. The address of the memory
2093/// location does not have to be aligned.
2094/// \param __a
2095/// A 128-bit vector of [4 x float] containing the values to be stored.
2096static __inline__ void __DEFAULT_FN_ATTRS
2097_mm_storeu_ps(float *__p, __m128 __a)
2098{
2099 struct __storeu_ps {
2100 __m128_u __v;
2101 } __attribute__((__packed__, __may_alias__));
2102 ((struct __storeu_ps*)__p)->__v = __a;
2103}
2104
2105/// Stores a 128-bit vector of [4 x float] into an aligned memory
2106/// location.
2107///
2108/// \headerfile <x86intrin.h>
2109///
2110/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2111///
2112/// \param __p
2113/// A pointer to a 128-bit memory location. The address of the memory
2114/// location has to be 16-byte aligned.
2115/// \param __a
2116/// A 128-bit vector of [4 x float] containing the values to be stored.
2117static __inline__ void __DEFAULT_FN_ATTRS
2118_mm_store_ps(float *__p, __m128 __a)
2119{
2120 *(__m128*)__p = __a;
2121}
2122
2123/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2124/// four contiguous elements in an aligned memory location.
2125///
2126/// \headerfile <x86intrin.h>
2127///
2128/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2129/// instruction.
2130///
2131/// \param __p
2132/// A pointer to a 128-bit memory location.
2133/// \param __a
2134/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2135/// of the four contiguous elements pointed by \a __p.
2136static __inline__ void __DEFAULT_FN_ATTRS
2137_mm_store1_ps(float *__p, __m128 __a)
2138{
2139 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2141}
2142
2143/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2144/// four contiguous elements in an aligned memory location.
2145///
2146/// \headerfile <x86intrin.h>
2147///
2148/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2149/// instruction.
2150///
2151/// \param __p
2152/// A pointer to a 128-bit memory location.
2153/// \param __a
2154/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2155/// of the four contiguous elements pointed by \a __p.
2156static __inline__ void __DEFAULT_FN_ATTRS
2157_mm_store_ps1(float *__p, __m128 __a)
2158{
2160}
2161
2162/// Stores float values from a 128-bit vector of [4 x float] to an
2163/// aligned memory location in reverse order.
2164///
2165/// \headerfile <x86intrin.h>
2166///
2167/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2168/// instruction.
2169///
2170/// \param __p
2171/// A pointer to a 128-bit memory location. The address of the memory
2172/// location has to be 128-bit aligned.
2173/// \param __a
2174/// A 128-bit vector of [4 x float] containing the values to be stored.
2175static __inline__ void __DEFAULT_FN_ATTRS
2176_mm_storer_ps(float *__p, __m128 __a)
2177{
2178 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2180}
2181
/* Hint selectors for _mm_prefetch. In the macro form of _mm_prefetch, bit 2
   of the selector is forwarded as the read/write argument of
   __builtin_prefetch and bits [1:0] as its locality argument. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
2188
2189#ifndef _MSC_VER
2190// If _MSC_VER is defined, we use the builtin variant of _mm_prefetch.
2191// Otherwise, we provide this macro, which includes a cast, allowing the user
// to pass a pointer of any type. The _mm_prefetch accepts char to match MSVC.
2193
2194/// Loads one cache line of data from the specified address to a location
2195/// closer to the processor.
2196///
2197/// \headerfile <x86intrin.h>
2198///
2199/// \code
2200/// void _mm_prefetch(const void *a, const int sel);
2201/// \endcode
2202///
2203/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
2204///
2205/// \param a
2206/// A pointer to a memory location containing a cache line of data.
2207/// \param sel
2208/// A predefined integer constant specifying the type of prefetch
2209/// operation: \n
2210/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2211/// PREFETCHNTA instruction will be generated. \n
2212/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2213/// be generated. \n
2214/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2215/// be generated. \n
2216/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2217/// be generated.
/* Bit 2 of sel becomes the read/write argument, bits [1:0] the locality. */
#define _mm_prefetch(a, sel)                                                   \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 0x3))
2220#endif
2221
2222/// Stores a 64-bit integer in the specified aligned memory location. To
2223/// minimize caching, the data is flagged as non-temporal (unlikely to be
2224/// used again soon).
2225///
2226/// \headerfile <x86intrin.h>
2227///
2228/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2229///
2230/// \param __p
2231/// A pointer to an aligned memory location used to store the register value.
2232/// \param __a
2233/// A 64-bit integer containing the value to be stored.
2234static __inline__ void __DEFAULT_FN_ATTRS
2235_mm_stream_pi(void *__p, __m64 __a)
2236{
2237 __builtin_nontemporal_store(__a, (__m64 *)__p);
2238}
2239
2240/// Moves packed float values from a 128-bit vector of [4 x float] to a
2241/// 128-bit aligned memory location. To minimize caching, the data is flagged
2242/// as non-temporal (unlikely to be used again soon).
2243///
2244/// \headerfile <x86intrin.h>
2245///
2246/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2247///
2248/// \param __p
2249/// A pointer to a 128-bit aligned memory location that will receive the
2250/// single-precision floating-point values.
2251/// \param __a
2252/// A 128-bit vector of [4 x float] containing the values to be moved.
2253static __inline__ void __DEFAULT_FN_ATTRS
2254_mm_stream_ps(void *__p, __m128 __a)
2255{
2256 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2257}
2258
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
///    instructions preceding this instruction and store instructions following
///    this instruction, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// Only declared here (with C linkage when compiled as C++); no definition
/// appears at this point in the header.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2277
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
// Both arguments are parenthesized so that the casts apply to the whole
// argument expression rather than only to its first operand.
#define _mm_extract_pi16(a, n)                                                 \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2300
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
// All arguments are parenthesized so that the casts apply to the whole
// argument expression rather than only to its first operand.
#define _mm_insert_pi16(a, d, n)                                               \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2331
2332/// Compares each of the corresponding packed 16-bit integer values of
2333/// the 64-bit integer vectors, and writes the greater value to the
2334/// corresponding bits in the destination.
2335///
2336/// \headerfile <x86intrin.h>
2337///
2338/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2339///
2340/// \param __a
2341/// A 64-bit integer vector containing one of the source operands.
2342/// \param __b
2343/// A 64-bit integer vector containing one of the source operands.
2344/// \returns A 64-bit integer vector containing the comparison results.
2345static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2346_mm_max_pi16(__m64 __a, __m64 __b) {
2347 return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
2348}
2349
2350/// Compares each of the corresponding packed 8-bit unsigned integer
2351/// values of the 64-bit integer vectors, and writes the greater value to the
2352/// corresponding bits in the destination.
2353///
2354/// \headerfile <x86intrin.h>
2355///
2356/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2357///
2358/// \param __a
2359/// A 64-bit integer vector containing one of the source operands.
2360/// \param __b
2361/// A 64-bit integer vector containing one of the source operands.
2362/// \returns A 64-bit integer vector containing the comparison results.
2363static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2364_mm_max_pu8(__m64 __a, __m64 __b) {
2365 return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
2366}
2367
2368/// Compares each of the corresponding packed 16-bit integer values of
2369/// the 64-bit integer vectors, and writes the lesser value to the
2370/// corresponding bits in the destination.
2371///
2372/// \headerfile <x86intrin.h>
2373///
2374/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2375///
2376/// \param __a
2377/// A 64-bit integer vector containing one of the source operands.
2378/// \param __b
2379/// A 64-bit integer vector containing one of the source operands.
2380/// \returns A 64-bit integer vector containing the comparison results.
2381static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2382_mm_min_pi16(__m64 __a, __m64 __b) {
2383 return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
2384}
2385
2386/// Compares each of the corresponding packed 8-bit unsigned integer
2387/// values of the 64-bit integer vectors, and writes the lesser value to the
2388/// corresponding bits in the destination.
2389///
2390/// \headerfile <x86intrin.h>
2391///
2392/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2393///
2394/// \param __a
2395/// A 64-bit integer vector containing one of the source operands.
2396/// \param __b
2397/// A 64-bit integer vector containing one of the source operands.
2398/// \returns A 64-bit integer vector containing the comparison results.
2399static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2400_mm_min_pu8(__m64 __a, __m64 __b) {
2401 return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
2402}
2403
2404/// Takes the most significant bit from each 8-bit element in a 64-bit
2405/// integer vector to create an 8-bit mask value. Zero-extends the value to
2406/// 32-bit integer and writes it to the destination.
2407///
2408/// \headerfile <x86intrin.h>
2409///
2410/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2411///
2412/// \param __a
2413/// A 64-bit integer vector containing the values with bits to be extracted.
2414/// \returns The most significant bit from each 8-bit element in \a __a,
2415/// written to bits [7:0].
2416static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2418 return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
2419}
2420
2421/// Multiplies packed 16-bit unsigned integer values and writes the
2422/// high-order 16 bits of each 32-bit product to the corresponding bits in
2423/// the destination.
2424///
2425/// \headerfile <x86intrin.h>
2426///
2427/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2428///
2429/// \param __a
2430/// A 64-bit integer vector containing one of the source operands.
2431/// \param __b
2432/// A 64-bit integer vector containing one of the source operands.
2433/// \returns A 64-bit integer vector containing the products of both operands.
2434static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2436{
2437 return __trunc64(__builtin_ia32_pmulhuw128((__v8hu)__zext128(__a),
2438 (__v8hu)__zext128(__b)));
2439}
2440
2441/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2442/// destination, as specified by the immediate value operand.
2443///
2444/// \headerfile <x86intrin.h>
2445///
2446/// \code
2447/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2448/// \endcode
2449///
2450/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2451///
2452/// \param a
2453/// A 64-bit integer vector containing the values to be shuffled.
2454/// \param n
2455/// An immediate value containing an 8-bit value specifying which elements to
2456/// copy from \a a. The destinations within the 64-bit destination are
2457/// assigned values as follows: \n
2458/// Bits [1:0] are used to assign values to bits [15:0] in the
2459/// destination. \n
2460/// Bits [3:2] are used to assign values to bits [31:16] in the
2461/// destination. \n
2462/// Bits [5:4] are used to assign values to bits [47:32] in the
2463/// destination. \n
2464/// Bits [7:6] are used to assign values to bits [63:48] in the
2465/// destination. \n
2466/// Bit value assignments: \n
2467/// 00: assigned from bits [15:0] of \a a. \n
2468/// 01: assigned from bits [31:16] of \a a. \n
2469/// 10: assigned from bits [47:32] of \a a. \n
2470/// 11: assigned from bits [63:48] of \a a. \n
2471/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2472/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2473/// <c>[b6, b4, b2, b0]</c>.
2474/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3,                 \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
2479
2480/// Conditionally copies the values from each 8-bit element in the first
2481/// 64-bit integer vector operand to the specified memory location, as
2482/// specified by the most significant bit in the corresponding element in the
2483/// second 64-bit integer vector operand.
2484///
2485/// To minimize caching, the data is flagged as non-temporal
2486/// (unlikely to be used again soon).
2487///
2488/// \headerfile <x86intrin.h>
2489///
2490/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2491///
2492/// \param __d
2493/// A 64-bit integer vector containing the values with elements to be copied.
2494/// \param __n
2495/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2496/// element determines whether the corresponding element in operand \a __d
2497/// is copied. If the most significant bit of a given element is 1, the
2498/// corresponding element in operand \a __d is copied.
2499/// \param __p
2500/// A pointer to a 64-bit memory location that will receive the conditionally
2501/// copied integer values. The address of the memory location does not have
2502/// to be aligned.
2503static __inline__ void __DEFAULT_FN_ATTRS_SSE2
2504_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2505{
2506 // This is complex, because we need to support the case where __p is pointing
2507 // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
2508 // write might cause a trap where a 64-bit maskmovq would not. (Memory
2509 // locations not selected by the mask bits might still cause traps.)
2510 __m128i __d128 = __anyext128(__d);
2511 __m128i __n128 = __zext128(__n);
2512 if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
2513 ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
2514 // If there's a risk of spurious trap due to a 128-bit write, back up the
2515 // pointer by 8 bytes and shift values in registers to match.
2516 __p -= 8;
2517 __d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8);
2518 __n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8);
2519 }
2520
2521 __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
2522}
2523
2524/// Computes the rounded averages of the packed unsigned 8-bit integer
2525/// values and writes the averages to the corresponding bits in the
2526/// destination.
2527///
2528/// \headerfile <x86intrin.h>
2529///
2530/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2531///
2532/// \param __a
2533/// A 64-bit integer vector containing one of the source operands.
2534/// \param __b
2535/// A 64-bit integer vector containing one of the source operands.
2536/// \returns A 64-bit integer vector containing the averages of both operands.
2537static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2538_mm_avg_pu8(__m64 __a, __m64 __b) {
2539 return __trunc64(__builtin_ia32_pavgb128((__v16qu)__zext128(__a),
2540 (__v16qu)__zext128(__b)));
2541}
2542
2543/// Computes the rounded averages of the packed unsigned 16-bit integer
2544/// values and writes the averages to the corresponding bits in the
2545/// destination.
2546///
2547/// \headerfile <x86intrin.h>
2548///
2549/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2550///
2551/// \param __a
2552/// A 64-bit integer vector containing one of the source operands.
2553/// \param __b
2554/// A 64-bit integer vector containing one of the source operands.
2555/// \returns A 64-bit integer vector containing the averages of both operands.
2556static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2557_mm_avg_pu16(__m64 __a, __m64 __b) {
2558 return __trunc64(
2559 __builtin_ia32_pavgw128((__v8hu)__zext128(__a), (__v8hu)__zext128(__b)));
2560}
2561
2562/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value of each
/// difference. The sum of the 8 absolute differences is then written to
/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2566///
2567/// \headerfile <x86intrin.h>
2568///
2569/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2570///
2571/// \param __a
2572/// A 64-bit integer vector containing one of the source operands.
2573/// \param __b
2574/// A 64-bit integer vector containing one of the source operands.
2575/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2576/// sets of absolute differences between both operands. The upper bits are
2577/// cleared.
2578static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2579_mm_sad_pu8(__m64 __a, __m64 __b)
2580{
2581 return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2582 (__v16qi)__zext128(__b)));
2583}
2584
/* _mm_getcsr/_mm_setcsr below are declared with C linkage. */
#if defined(__cplusplus)
extern "C" {
#endif
2588
2589/// Returns the contents of the MXCSR register as a 32-bit unsigned
2590/// integer value.
2591///
2592/// There are several groups of macros associated with this
2593/// intrinsic, including:
2594/// <ul>
2595/// <li>
2596/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2597/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2598/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2599/// _MM_GET_EXCEPTION_STATE().
2600/// </li>
2601/// <li>
2602/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2603/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2604/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2605/// </li>
2606/// <li>
2607/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2608/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2609/// _MM_GET_ROUNDING_MODE().
2610/// </li>
2611/// <li>
2612/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2613/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2614/// </li>
2615/// <li>
2616/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2617/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2618/// _MM_GET_DENORMALS_ZERO_MODE().
2619/// </li>
2620/// </ul>
2621///
2622/// For example, the following expression checks if an overflow exception has
2623/// occurred:
2624/// \code
2625/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2626/// \endcode
2627///
2628/// The following expression gets the current rounding mode:
2629/// \code
2630/// _MM_GET_ROUNDING_MODE()
2631/// \endcode
2632///
2633/// \headerfile <x86intrin.h>
2634///
2635/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2636///
2637/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2638/// register.
unsigned int _mm_getcsr(void);
2640
2641/// Sets the MXCSR register with the 32-bit unsigned integer value.
2642///
2643/// There are several groups of macros associated with this intrinsic,
2644/// including:
2645/// <ul>
2646/// <li>
2647/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2648/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2649/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2650/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2651/// </li>
2652/// <li>
2653/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2654/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2655/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2656/// of these macros.
2657/// </li>
2658/// <li>
2659/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2660/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2661/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2662/// </li>
2663/// <li>
2664/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2665/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2666/// one of these macros.
2667/// </li>
2668/// <li>
2669/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2670/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2671/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2672/// </li>
2673/// </ul>
2674///
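/// For example, provided the rounding-control bits are currently clear
/// (_MM_ROUND_NEAREST), the following expression causes subsequent
/// floating-point operations to round up:
/// \code
/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
/// \endcode
/// In general, prefer _MM_SET_ROUNDING_MODE(_MM_ROUND_UP), which clears the
/// previous rounding mode before setting the new one.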
2678///
2679/// The following example sets the DAZ and FTZ flags:
2680/// \code
2681/// void setFlags() {
2682/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2683/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2684/// }
2685/// \endcode
2686///
2687/// \headerfile <x86intrin.h>
2688///
2689/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2690///
2691/// \param __i
2692/// A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);
2694
#if defined(__cplusplus)
} // extern "C"
#endif
2698
2699/// Selects 4 float values from the 128-bit operands of [4 x float], as
2700/// specified by the immediate value operand.
2701///
2702/// \headerfile <x86intrin.h>
2703///
2704/// \code
2705/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2706/// \endcode
2707///
2708/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2709///
2710/// \param a
2711/// A 128-bit vector of [4 x float].
2712/// \param b
2713/// A 128-bit vector of [4 x float].
2714/// \param mask
2715/// An immediate value containing an 8-bit value specifying which elements to
2716/// copy from \a a and \a b. \n
2717/// Bits [3:0] specify the values copied from operand \a a. \n
2718/// Bits [7:4] specify the values copied from operand \a b. \n
2719/// The destinations within the 128-bit destination are assigned values as
2720/// follows: \n
2721/// Bits [1:0] are used to assign values to bits [31:0] in the
2722/// destination. \n
2723/// Bits [3:2] are used to assign values to bits [63:32] in the
2724/// destination. \n
2725/// Bits [5:4] are used to assign values to bits [95:64] in the
2726/// destination. \n
2727/// Bits [7:6] are used to assign values to bits [127:96] in the
2728/// destination. \n
2729/// Bit value assignments: \n
2730/// 00: Bits [31:0] copied from the specified operand. \n
2731/// 01: Bits [63:32] copied from the specified operand. \n
2732/// 10: Bits [95:64] copied from the specified operand. \n
2733/// 11: Bits [127:96] copied from the specified operand. \n
2734/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2735/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2736/// <c>[b6, b4, b2, b0]</c>.
2737/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask)                                             \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b),     \
                                 (int)(mask)))
2741
2742/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2743/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2744///
2745/// \headerfile <x86intrin.h>
2746///
2747/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2748///
2749/// \param __a
2750/// A 128-bit vector of [4 x float]. \n
2751/// Bits [95:64] are written to bits [31:0] of the destination. \n
2752/// Bits [127:96] are written to bits [95:64] of the destination.
2753/// \param __b
/// A 128-bit vector of [4 x float]. \n
2755/// Bits [95:64] are written to bits [63:32] of the destination. \n
2756/// Bits [127:96] are written to bits [127:96] of the destination.
2757/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2758static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2759_mm_unpackhi_ps(__m128 __a, __m128 __b) {
2760 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2761}
2762
2763/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2764/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2765///
2766/// \headerfile <x86intrin.h>
2767///
2768/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2769///
2770/// \param __a
2771/// A 128-bit vector of [4 x float]. \n
2772/// Bits [31:0] are written to bits [31:0] of the destination. \n
2773/// Bits [63:32] are written to bits [95:64] of the destination.
2774/// \param __b
2775/// A 128-bit vector of [4 x float]. \n
2776/// Bits [31:0] are written to bits [63:32] of the destination. \n
2777/// Bits [63:32] are written to bits [127:96] of the destination.
2778/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2779static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2780_mm_unpacklo_ps(__m128 __a, __m128 __b) {
2781 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2782}
2783
2784/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2785/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2786/// 96 bits are set to the upper 96 bits of the first parameter.
2787///
2788/// \headerfile <x86intrin.h>
2789///
2790/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2791/// instruction.
2792///
2793/// \param __a
2794/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2795/// written to the upper 96 bits of the result.
2796/// \param __b
2797/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2798/// written to the lower 32 bits of the result.
2799/// \returns A 128-bit floating-point vector of [4 x float].
2800static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2801_mm_move_ss(__m128 __a, __m128 __b) {
2802 __a[0] = __b[0];
2803 return __a;
2804}
2805
2806/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2807/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2808/// 64 bits are set to the upper 64 bits of the first parameter.
2809///
2810/// \headerfile <x86intrin.h>
2811///
2812/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2813///
2814/// \param __a
2815/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2816/// written to the upper 64 bits of the result.
2817/// \param __b
2818/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2819/// written to the lower 64 bits of the result.
2820/// \returns A 128-bit floating-point vector of [4 x float].
2821static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2822_mm_movehl_ps(__m128 __a, __m128 __b) {
2823 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2824}
2825
2826/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2827/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2828/// 64 bits are set to the lower 64 bits of the second parameter.
2829///
2830/// \headerfile <x86intrin.h>
2831///
2832/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2833///
2834/// \param __a
2835/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2836/// written to the lower 64 bits of the result.
2837/// \param __b
2838/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2839/// written to the upper 64 bits of the result.
2840/// \returns A 128-bit floating-point vector of [4 x float].
2841static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2842_mm_movelh_ps(__m128 __a, __m128 __b) {
2843 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2844}
2845
2846/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2847/// float].
2848///
2849/// \headerfile <x86intrin.h>
2850///
2851/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2852///
2853/// \param __a
2854/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2855/// from the corresponding elements in this operand.
2856/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2857/// values from the operand.
2858static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2860{
2861 return __builtin_convertvector((__v4hi)__a, __v4sf);
2862}
2863
2864/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2865/// 128-bit vector of [4 x float].
2866///
2867/// \headerfile <x86intrin.h>
2868///
2869/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2870///
2871/// \param __a
2872/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2873/// destination are copied from the corresponding elements in this operand.
2874/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2875/// values from the operand.
2876static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2878{
2879 return __builtin_convertvector((__v4hu)__a, __v4sf);
2880}
2881
2882/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2883/// into a 128-bit vector of [4 x float].
2884///
2885/// \headerfile <x86intrin.h>
2886///
2887/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2888///
2889/// \param __a
2890/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2891/// from the corresponding lower 4 elements in this operand.
2892/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2893/// values from the operand.
2894static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2896{
2897 return __builtin_convertvector(
2898 __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2899 0, 1, 2, 3), __v4sf);
2900}
2901
2902/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2903/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2904///
2905/// \headerfile <x86intrin.h>
2906///
2907/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2908///
2909/// \param __a
2910/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2911/// destination are copied from the corresponding lower 4 elements in this
2912/// operand.
2913/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2914/// values from the source operand.
2915static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2917{
2918 return __builtin_convertvector(
2919 __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2920 0, 1, 2, 3), __v4sf);
2921}
2922
2923/// Converts the two 32-bit signed integer values from each 64-bit vector
2924/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2925///
2926/// \headerfile <x86intrin.h>
2927///
2928/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2929///
2930/// \param __a
2931/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2932/// copied from the elements in this operand.
2933/// \param __b
2934/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2935/// copied from the elements in this operand.
2936/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2937/// copied and converted values from the first operand. The upper 64 bits
2938/// contain the copied and converted values from the second operand.
2939static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2941{
2942 return __builtin_convertvector(
2943 __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2944 0, 1, 2, 3), __v4sf);
2945}
2946
2947/// Converts each single-precision floating-point element of a 128-bit
2948/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2949/// packs the results into a 64-bit integer vector of [4 x i16].
2950///
2951/// If the floating-point element is NaN or infinity, or if the
2952/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2953/// it is converted to 0x8000. Otherwise if the floating-point element is
2954/// greater than 0x7FFF, it is converted to 0x7FFF.
2955///
2956/// \headerfile <x86intrin.h>
2957///
2958/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2959///
2960/// \param __a
2961/// A 128-bit floating-point vector of [4 x float].
2962/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2963/// values.
2964static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2966{
2967 return __trunc64(__builtin_ia32_packssdw128(
2968 (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
2969}
2970
2971/// Converts each single-precision floating-point element of a 128-bit
2972/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2973/// packs the results into the lower 32 bits of a 64-bit integer vector of
2974/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2975///
2976/// If the floating-point element is NaN or infinity, or if the
2977/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2978/// is converted to 0x80. Otherwise if the floating-point element is greater
2979/// than 0x7F, it is converted to 0x7F.
2980///
2981/// \headerfile <x86intrin.h>
2982///
2983/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2984///
2985/// \param __a
2986/// 128-bit floating-point vector of [4 x float].
2987/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
/// converted values and the upper 32 bits are set to zero.
2989static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2991{
2992 __m64 __b, __c;
2993
2996
2997 return _mm_packs_pi16(__b, __c);
2998}
2999
3000/// Extracts the sign bits from each single-precision floating-point
3001/// element of a 128-bit floating-point vector of [4 x float] and returns the
/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
3003/// to zero.
3004///
3005/// \headerfile <x86intrin.h>
3006///
3007/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3008///
3009/// \param __a
3010/// A 128-bit floating-point vector of [4 x float].
3011/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3012/// single-precision floating-point element of the parameter. Bits [31:4] are
3013/// set to zero.
3014static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_ps(__m128 __a) {
3015 return __builtin_ia32_movmskps((__v4sf)__a);
3016}
3017
/* Compare: predicate immediates accepted by _mm_cmp_ps / _mm_cmp_ss. */
#define _CMP_EQ_OQ   0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS   0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS   0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ  0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US  0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US  0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q   0x07 /* Ordered (non-signaling)  */
3027
3028/// Compares each of the corresponding values of two 128-bit vectors of
3029/// [4 x float], using the operation specified by the immediate integer
3030/// operand.
3031///
3032/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3033/// If either value in a comparison is NaN, comparisons that are ordered
3034/// return false, and comparisons that are unordered return true.
3035///
3036/// \headerfile <x86intrin.h>
3037///
3038/// \code
3039/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
3040/// \endcode
3041///
3042/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
3043///
3044/// \param a
3045/// A 128-bit vector of [4 x float].
3046/// \param b
3047/// A 128-bit vector of [4 x float].
3048/// \param c
3049/// An immediate integer operand, with bits [4:0] specifying which comparison
3050/// operation to use: \n
3051/// 0x00: Equal (ordered, non-signaling) \n
3052/// 0x01: Less-than (ordered, signaling) \n
3053/// 0x02: Less-than-or-equal (ordered, signaling) \n
3054/// 0x03: Unordered (non-signaling) \n
3055/// 0x04: Not-equal (unordered, non-signaling) \n
3056/// 0x05: Not-less-than (unordered, signaling) \n
3057/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3058/// 0x07: Ordered (non-signaling) \n
3059/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3062
3063/// Compares each of the corresponding scalar values of two 128-bit
3064/// vectors of [4 x float], using the operation specified by the immediate
3065/// integer operand.
3066///
3067/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3068/// If either value in a comparison is NaN, comparisons that are ordered
3069/// return false, and comparisons that are unordered return true.
3070///
3071/// \headerfile <x86intrin.h>
3072///
3073/// \code
3074/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
3075/// \endcode
3076///
3077/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
3078///
3079/// \param a
3080/// A 128-bit vector of [4 x float].
3081/// \param b
3082/// A 128-bit vector of [4 x float].
3083/// \param c
3084/// An immediate integer operand, with bits [4:0] specifying which comparison
3085/// operation to use: \n
3086/// 0x00: Equal (ordered, non-signaling) \n
3087/// 0x01: Less-than (ordered, signaling) \n
3088/// 0x02: Less-than-or-equal (ordered, signaling) \n
3089/// 0x03: Unordered (non-signaling) \n
3090/// 0x04: Not-equal (unordered, non-signaling) \n
3091/// 0x05: Not-less-than (unordered, signaling) \n
3092/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3093/// 0x07: Ordered (non-signaling) \n
3094/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3097
/* Declaration attribute requesting 16-byte alignment. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit selectors into one 8-bit shuffle immediate; z lands in
   bits [7:6], y in [5:4], x in [3:2] and w in [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3101
/* Exception-state flag bits, as used with _mm_getcsr()/_mm_setcsr(). */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* Exception-mask bits. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* Rounding-mode field values. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* Flush-to-zero mode bit. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)

/* Getters: read the control/status word and isolate one field. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Setters: clear one field of the current value, OR in x, and write it back.
   x is not masked, so it must only contain bits belonging to that field. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3137
/* Transposes, in place, the 4x4 matrix whose rows are the four __m128
   lvalues row0..row3. The temporaries use reserved (double-underscore) names
   so that they cannot shadow a caller's row variable that happens to be
   called tmp0..tmp3. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __mm_t4_tmp3, __mm_t4_tmp2, __mm_t4_tmp1, __mm_t4_tmp0; \
  __mm_t4_tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __mm_t4_tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __mm_t4_tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __mm_t4_tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__mm_t4_tmp0, __mm_t4_tmp2); \
  (row1) = _mm_movehl_ps(__mm_t4_tmp2, __mm_t4_tmp0); \
  (row2) = _mm_movelh_ps(__mm_t4_tmp1, __mm_t4_tmp3); \
  (row3) = _mm_movehl_ps(__mm_t4_tmp3, __mm_t4_tmp1); \
} while (0)
3150
/* Aliases for compatibility. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
3166
/* Drop the header-internal helper macros so they do not leak to includers. */
#undef __trunc64
#undef __zext128
#undef __anyext128
#undef __zeroupper64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS_SSE2
#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
3175
3176/* Ugly hack for backwards-compatibility (compatible with gcc) */
3177#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3178#include <emmintrin.h>
3179#endif
3180
3181#endif /* __XMMINTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
static __inline__ uint32_t uint32_t __y
Definition arm_acle.h:125
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ void int __a
Definition emmintrin.h:4076
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition mmintrin.h:148
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition mmintrin.h:1273
#define __DEFAULT_FN_ATTRS_SSE2
Definition mmintrin.h:47
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1178
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition xmmintrin.h:1500
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition xmmintrin.h:2557
static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
Definition xmmintrin.h:2417
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition xmmintrin.h:267
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2801
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are less than those in the second operand.
Definition xmmintrin.h:579
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1707
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition xmmintrin.h:234
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not greater than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:954
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equality.
Definition xmmintrin.h:531
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2859
#define __anyext128(x)
Definition xmmintrin.h:56
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is less than the corresponding value in the second operand.
Definition xmmintrin.h:557
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition xmmintrin.h:2538
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition xmmintrin.h:1478
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition xmmintrin.h:1412
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1681
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:510
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition xmmintrin.h:2435
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:160
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
Definition xmmintrin.h:48
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition xmmintrin.h:1843
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for inequality.
Definition xmmintrin.h:775
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is less than or equal to the second operand.
Definition xmmintrin.h:1154
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition xmmintrin.h:2940
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
Definition xmmintrin.h:2176
#define __zeroupper64(x)
Definition xmmintrin.h:59
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition xmmintrin.h:1896
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:852
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition xmmintrin.h:1977
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than the second operand.
Definition xmmintrin.h:1273
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition xmmintrin.h:119
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1610
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition xmmintrin.h:2780
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:79
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
Definition xmmintrin.h:1724
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is less than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:605
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition xmmintrin.h:2579
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
Definition xmmintrin.h:98
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
Definition xmmintrin.h:320
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:486
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2055
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1951
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than or equal to the second operand.
Definition xmmintrin.h:1297
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than or equal to those in the second operand.
Definition xmmintrin.h:727
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:1105
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition xmmintrin.h:2137
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1632
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1933
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition xmmintrin.h:2759
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition xmmintrin.h:218
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
Definition xmmintrin.h:412
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2382
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
Definition xmmintrin.h:303
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand.
Definition xmmintrin.h:1345
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition xmmintrin.h:451
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is less than the second operand.
Definition xmmintrin.h:1130
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
Definition xmmintrin.h:1771
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2097
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2822
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
Definition xmmintrin.h:1820
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
Definition xmmintrin.h:366
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
Definition xmmintrin.h:2254
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition xmmintrin.h:2235
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand.
Definition xmmintrin.h:1202
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition xmmintrin.h:1390
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than those in the second operand.
Definition xmmintrin.h:677
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2346
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than the second operand.
Definition xmmintrin.h:1321
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2965
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2842
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2076
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not greater than the corresponding value in the second operand.
Definition xmmintrin.h:902
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
Definition xmmintrin.h:1744
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
Definition xmmintrin.h:2895
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:284
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition xmmintrin.h:2118
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:468
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:250
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality.
Definition xmmintrin.h:753
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition xmmintrin.h:3014
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
Definition xmmintrin.h:2916
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2877
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition xmmintrin.h:1568
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition xmmintrin.h:1458
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition xmmintrin.h:1522
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2990
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:430
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition xmmintrin.h:2364
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition xmmintrin.h:1882
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is ordered with respect to the corresponding value in the second operand.
Definition xmmintrin.h:1006
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than those in the second operand.
Definition xmmintrin.h:825
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2034
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than those in the second operand.
Definition xmmintrin.h:927
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than or equal to those in the second operand.
Definition xmmintrin.h:979
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are ordered with respect to those in the second operand.
Definition xmmintrin.h:1030
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2018
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than the corresponding value in the second operand.
Definition xmmintrin.h:653
#define __trunc64(x)
Definition xmmintrin.h:51
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine equality.
Definition xmmintrin.h:1249
static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition xmmintrin.h:2504
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than the corresponding value in the second operand.
Definition xmmintrin.h:802
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition xmmintrin.h:2157
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition xmmintrin.h:2004
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition xmmintrin.h:345
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are less than or equal to those in the second operand.
Definition xmmintrin.h:627
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
Definition xmmintrin.h:139
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
Definition xmmintrin.h:391
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...
Definition xmmintrin.h:200
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine inequality.
Definition xmmintrin.h:1368
#define __zext128(x)
Definition xmmintrin.h:53
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is not equal to the second operand.
Definition xmmintrin.h:1226
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition xmmintrin.h:1589
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1916
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition xmmintrin.h:2400
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition xmmintrin.h:875
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is unordered with respect to the corresponding value in the second operand.
Definition xmmintrin.h:1057
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:703
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are unordered with respect to those in the second operand.
Definition xmmintrin.h:1081
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1860
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1798
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition xmmintrin.h:179