/* Source listing of xmmintrin.h from clang 22.0.0git, taken from the
 * generated documentation page for this file. */
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
13#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
17#include <mmintrin.h>
18
/* 128-bit vector of four floats, used internally for element-wise math. */
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public 128-bit float vector type; declared with 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same layout as __m128 but declared with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))

/* The _CONSTEXPR variants append `constexpr` when compiling as C++11 or
 * later; otherwise (including plain C) they are the same as the base
 * attribute sets. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif

/* Helper shuffles. Each expansion is wrapped in parentheses so that the
 * leading cast cannot bind to a postfix operator written after the macro. */

/* Selects element 0 of x viewed as two 64-bit lanes and yields it as __m64. */
#define __trunc64(x)                                                           \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
/* Widens a 2 x 32-bit value to 128 bits: lanes 0-1 come from x, lanes 2-3
 * come from the empty-initialized second vector. */
#define __zext128(x)                                                           \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, 2, 3))
/* Widens a 2 x 32-bit value to 128 bits: lanes 0-1 come from x, lanes 2-3
 * use shuffle index -1 (no particular source element is requested). */
#define __anyext128(x)                                                         \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1))
/* Keeps 32-bit lanes 0-1 of x and takes lanes 2-3 from the empty-initialized
 * second vector (indices 4 and 5). */
#define __zeroupper64(x)                                                       \
  ((__m128i)__builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
                                    1, 4, 5))

63/// Adds the 32-bit float values in the low-order bits of the operands.
64///
65/// \headerfile <x86intrin.h>
66///
67/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
68///
69/// \param __a
70/// A 128-bit vector of [4 x float] containing one of the source operands.
71/// The lower 32 bits of this operand are used in the calculation.
72/// \param __b
73/// A 128-bit vector of [4 x float] containing one of the source operands.
74/// The lower 32 bits of this operand are used in the calculation.
75/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
76/// of the lower 32 bits of both operands. The upper 96 bits are copied from
77/// the upper 96 bits of the first source operand.
78static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
79_mm_add_ss(__m128 __a, __m128 __b) {
80 __a[0] += __b[0];
81 return __a;
82}
83
84/// Adds two 128-bit vectors of [4 x float], and returns the results of
85/// the addition.
86///
87/// \headerfile <x86intrin.h>
88///
89/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
90///
91/// \param __a
92/// A 128-bit vector of [4 x float] containing one of the source operands.
93/// \param __b
94/// A 128-bit vector of [4 x float] containing one of the source operands.
95/// \returns A 128-bit vector of [4 x float] containing the sums of both
96/// operands.
97static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
98_mm_add_ps(__m128 __a, __m128 __b) {
99 return (__m128)((__v4sf)__a + (__v4sf)__b);
100}
101
102/// Subtracts the 32-bit float value in the low-order bits of the second
103/// operand from the corresponding value in the first operand.
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
108///
109/// \param __a
110/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
111/// of this operand are used in the calculation.
112/// \param __b
113/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
114/// bits of this operand are used in the calculation.
115/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
116/// difference of the lower 32 bits of both operands. The upper 96 bits are
117/// copied from the upper 96 bits of the first source operand.
118static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
119_mm_sub_ss(__m128 __a, __m128 __b) {
120 __a[0] -= __b[0];
121 return __a;
122}
123
124/// Subtracts each of the values of the second operand from the first
125/// operand, both of which are 128-bit vectors of [4 x float] and returns
126/// the results of the subtraction.
127///
128/// \headerfile <x86intrin.h>
129///
130/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
131///
132/// \param __a
133/// A 128-bit vector of [4 x float] containing the minuend.
134/// \param __b
135/// A 128-bit vector of [4 x float] containing the subtrahend.
136/// \returns A 128-bit vector of [4 x float] containing the differences between
137/// both operands.
138static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
139_mm_sub_ps(__m128 __a, __m128 __b) {
140 return (__m128)((__v4sf)__a - (__v4sf)__b);
141}
142
143/// Multiplies two 32-bit float values in the low-order bits of the
144/// operands.
145///
146/// \headerfile <x86intrin.h>
147///
148/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
149///
150/// \param __a
151/// A 128-bit vector of [4 x float] containing one of the source operands.
152/// The lower 32 bits of this operand are used in the calculation.
153/// \param __b
154/// A 128-bit vector of [4 x float] containing one of the source operands.
155/// The lower 32 bits of this operand are used in the calculation.
156/// \returns A 128-bit vector of [4 x float] containing the product of the lower
157/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
158/// bits of the first source operand.
159static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
160_mm_mul_ss(__m128 __a, __m128 __b) {
161 __a[0] *= __b[0];
162 return __a;
163}
164
165/// Multiplies two 128-bit vectors of [4 x float] and returns the
166/// results of the multiplication.
167///
168/// \headerfile <x86intrin.h>
169///
170/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
171///
172/// \param __a
173/// A 128-bit vector of [4 x float] containing one of the source operands.
174/// \param __b
175/// A 128-bit vector of [4 x float] containing one of the source operands.
176/// \returns A 128-bit vector of [4 x float] containing the products of both
177/// operands.
178static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
179_mm_mul_ps(__m128 __a, __m128 __b) {
180 return (__m128)((__v4sf)__a * (__v4sf)__b);
181}
182
183/// Divides the value in the low-order 32 bits of the first operand by
184/// the corresponding value in the second operand.
185///
186/// \headerfile <x86intrin.h>
187///
188/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
189///
190/// \param __a
191/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
192/// bits of this operand are used in the calculation.
193/// \param __b
194/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
195/// of this operand are used in the calculation.
196/// \returns A 128-bit vector of [4 x float] containing the quotients of the
197/// lower 32 bits of both operands. The upper 96 bits are copied from the
198/// upper 96 bits of the first source operand.
199static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
200_mm_div_ss(__m128 __a, __m128 __b) {
201 __a[0] /= __b[0];
202 return __a;
203}
204
205/// Divides two 128-bit vectors of [4 x float].
206///
207/// \headerfile <x86intrin.h>
208///
209/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
210///
211/// \param __a
212/// A 128-bit vector of [4 x float] containing the dividend.
213/// \param __b
214/// A 128-bit vector of [4 x float] containing the divisor.
215/// \returns A 128-bit vector of [4 x float] containing the quotients of both
216/// operands.
217static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
218_mm_div_ps(__m128 __a, __m128 __b) {
219 return (__m128)((__v4sf)__a / (__v4sf)__b);
220}
221
222/// Calculates the square root of the value stored in the low-order bits
223/// of a 128-bit vector of [4 x float].
224///
225/// \headerfile <x86intrin.h>
226///
227/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
228///
229/// \param __a
230/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
231/// used in the calculation.
232/// \returns A 128-bit vector of [4 x float] containing the square root of the
233/// value in the low-order bits of the operand.
234static __inline__ __m128 __DEFAULT_FN_ATTRS
236{
237 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
238}
239
240/// Calculates the square roots of the values stored in a 128-bit vector
241/// of [4 x float].
242///
243/// \headerfile <x86intrin.h>
244///
245/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
246///
247/// \param __a
248/// A 128-bit vector of [4 x float].
249/// \returns A 128-bit vector of [4 x float] containing the square roots of the
250/// values in the operand.
251static __inline__ __m128 __DEFAULT_FN_ATTRS
253{
254 return __builtin_ia32_sqrtps((__v4sf)__a);
255}
256
257/// Calculates the approximate reciprocal of the value stored in the
258/// low-order bits of a 128-bit vector of [4 x float].
259///
260/// \headerfile <x86intrin.h>
261///
262/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
263///
264/// \param __a
265/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
266/// used in the calculation.
267/// \returns A 128-bit vector of [4 x float] containing the approximate
268/// reciprocal of the value in the low-order bits of the operand.
269static __inline__ __m128 __DEFAULT_FN_ATTRS
271{
272 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
273}
274
275/// Calculates the approximate reciprocals of the values stored in a
276/// 128-bit vector of [4 x float].
277///
278/// \headerfile <x86intrin.h>
279///
280/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
281///
282/// \param __a
283/// A 128-bit vector of [4 x float].
284/// \returns A 128-bit vector of [4 x float] containing the approximate
285/// reciprocals of the values in the operand.
286static __inline__ __m128 __DEFAULT_FN_ATTRS
288{
289 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
290}
291
292/// Calculates the approximate reciprocal of the square root of the value
293/// stored in the low-order bits of a 128-bit vector of [4 x float].
294///
295/// \headerfile <x86intrin.h>
296///
297/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
298///
299/// \param __a
300/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
301/// used in the calculation.
302/// \returns A 128-bit vector of [4 x float] containing the approximate
303/// reciprocal of the square root of the value in the low-order bits of the
304/// operand.
305static __inline__ __m128 __DEFAULT_FN_ATTRS
307{
308 return __builtin_ia32_rsqrtss((__v4sf)__a);
309}
310
311/// Calculates the approximate reciprocals of the square roots of the
312/// values stored in a 128-bit vector of [4 x float].
313///
314/// \headerfile <x86intrin.h>
315///
316/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
317///
318/// \param __a
319/// A 128-bit vector of [4 x float].
320/// \returns A 128-bit vector of [4 x float] containing the approximate
321/// reciprocals of the square roots of the values in the operand.
322static __inline__ __m128 __DEFAULT_FN_ATTRS
324{
325 return __builtin_ia32_rsqrtps((__v4sf)__a);
326}
327
328/// Compares two 32-bit float values in the low-order bits of both
329/// operands and returns the lesser value in the low-order bits of the
330/// vector of [4 x float].
331///
332/// If either value in a comparison is NaN, returns the value from \a __b.
333///
334/// \headerfile <x86intrin.h>
335///
336/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
337///
338/// \param __a
339/// A 128-bit vector of [4 x float] containing one of the operands. The lower
340/// 32 bits of this operand are used in the comparison.
341/// \param __b
342/// A 128-bit vector of [4 x float] containing one of the operands. The lower
343/// 32 bits of this operand are used in the comparison.
344/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
345/// minimum value between both operands. The upper 96 bits are copied from
346/// the upper 96 bits of the first source operand.
347static __inline__ __m128 __DEFAULT_FN_ATTRS
348_mm_min_ss(__m128 __a, __m128 __b)
349{
350 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
351}
352
353/// Compares two 128-bit vectors of [4 x float] and returns the lesser
354/// of each pair of values.
355///
356/// If either value in a comparison is NaN, returns the value from \a __b.
357///
358/// \headerfile <x86intrin.h>
359///
360/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
361///
362/// \param __a
363/// A 128-bit vector of [4 x float] containing one of the operands.
364/// \param __b
365/// A 128-bit vector of [4 x float] containing one of the operands.
366/// \returns A 128-bit vector of [4 x float] containing the minimum values
367/// between both operands.
368static __inline__ __m128 __DEFAULT_FN_ATTRS
369_mm_min_ps(__m128 __a, __m128 __b)
370{
371 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
372}
373
374/// Compares two 32-bit float values in the low-order bits of both
375/// operands and returns the greater value in the low-order bits of a 128-bit
376/// vector of [4 x float].
377///
378/// If either value in a comparison is NaN, returns the value from \a __b.
379///
380/// \headerfile <x86intrin.h>
381///
382/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
383///
384/// \param __a
385/// A 128-bit vector of [4 x float] containing one of the operands. The lower
386/// 32 bits of this operand are used in the comparison.
387/// \param __b
388/// A 128-bit vector of [4 x float] containing one of the operands. The lower
389/// 32 bits of this operand are used in the comparison.
390/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
391/// maximum value between both operands. The upper 96 bits are copied from
392/// the upper 96 bits of the first source operand.
393static __inline__ __m128 __DEFAULT_FN_ATTRS
394_mm_max_ss(__m128 __a, __m128 __b)
395{
396 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
397}
398
399/// Compares two 128-bit vectors of [4 x float] and returns the greater
400/// of each pair of values.
401///
402/// If either value in a comparison is NaN, returns the value from \a __b.
403///
404/// \headerfile <x86intrin.h>
405///
406/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
407///
408/// \param __a
409/// A 128-bit vector of [4 x float] containing one of the operands.
410/// \param __b
411/// A 128-bit vector of [4 x float] containing one of the operands.
412/// \returns A 128-bit vector of [4 x float] containing the maximum values
413/// between both operands.
414static __inline__ __m128 __DEFAULT_FN_ATTRS
415_mm_max_ps(__m128 __a, __m128 __b)
416{
417 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
418}
419
420/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
421///
422/// \headerfile <x86intrin.h>
423///
424/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
425///
426/// \param __a
427/// A 128-bit vector containing one of the source operands.
428/// \param __b
429/// A 128-bit vector containing one of the source operands.
430/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
431/// values between both operands.
432static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
433_mm_and_ps(__m128 __a, __m128 __b) {
434 return (__m128)((__v4su)__a & (__v4su)__b);
435}
436
437/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
438/// the one's complement of the values contained in the first source
439/// operand.
440///
441/// \headerfile <x86intrin.h>
442///
443/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
444///
445/// \param __a
446/// A 128-bit vector of [4 x float] containing the first source operand. The
447/// one's complement of this value is used in the bitwise AND.
448/// \param __b
449/// A 128-bit vector of [4 x float] containing the second source operand.
450/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
451/// one's complement of the first operand and the values in the second
452/// operand.
453static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
454_mm_andnot_ps(__m128 __a, __m128 __b) {
455 return (__m128)(~(__v4su)__a & (__v4su)__b);
456}
457
458/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
459///
460/// \headerfile <x86intrin.h>
461///
462/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
463///
464/// \param __a
465/// A 128-bit vector of [4 x float] containing one of the source operands.
466/// \param __b
467/// A 128-bit vector of [4 x float] containing one of the source operands.
468/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
469/// values between both operands.
470static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
471_mm_or_ps(__m128 __a, __m128 __b) {
472 return (__m128)((__v4su)__a | (__v4su)__b);
473}
474
475/// Performs a bitwise exclusive OR of two 128-bit vectors of
476/// [4 x float].
477///
478/// \headerfile <x86intrin.h>
479///
480/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
481///
482/// \param __a
483/// A 128-bit vector of [4 x float] containing one of the source operands.
484/// \param __b
485/// A 128-bit vector of [4 x float] containing one of the source operands.
486/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
487/// of the values between both operands.
488static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
489_mm_xor_ps(__m128 __a, __m128 __b) {
490 return (__m128)((__v4su)__a ^ (__v4su)__b);
491}
492
493/// Compares two 32-bit float values in the low-order bits of both
494/// operands for equality.
495///
496/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
497/// low-order bits of a vector [4 x float].
498/// If either value in a comparison is NaN, returns false.
499///
500/// \headerfile <x86intrin.h>
501///
502/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
503///
504/// \param __a
505/// A 128-bit vector of [4 x float] containing one of the operands. The lower
506/// 32 bits of this operand are used in the comparison.
507/// \param __b
508/// A 128-bit vector of [4 x float] containing one of the operands. The lower
509/// 32 bits of this operand are used in the comparison.
510/// \returns A 128-bit vector of [4 x float] containing the comparison results
511/// in the low-order bits.
512static __inline__ __m128 __DEFAULT_FN_ATTRS
513_mm_cmpeq_ss(__m128 __a, __m128 __b)
514{
515 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
516}
517
518/// Compares each of the corresponding 32-bit float values of the
519/// 128-bit vectors of [4 x float] for equality.
520///
521/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
522/// If either value in a comparison is NaN, returns false.
523///
524/// \headerfile <x86intrin.h>
525///
526/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
527///
528/// \param __a
529/// A 128-bit vector of [4 x float].
530/// \param __b
531/// A 128-bit vector of [4 x float].
532/// \returns A 128-bit vector of [4 x float] containing the comparison results.
533static __inline__ __m128 __DEFAULT_FN_ATTRS
534_mm_cmpeq_ps(__m128 __a, __m128 __b)
535{
536 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
537}
538
539/// Compares two 32-bit float values in the low-order bits of both
540/// operands to determine if the value in the first operand is less than the
541/// corresponding value in the second operand.
542///
543/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
544/// low-order bits of a vector of [4 x float].
545/// If either value in a comparison is NaN, returns false.
546///
547/// \headerfile <x86intrin.h>
548///
549/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
550///
551/// \param __a
552/// A 128-bit vector of [4 x float] containing one of the operands. The lower
553/// 32 bits of this operand are used in the comparison.
554/// \param __b
555/// A 128-bit vector of [4 x float] containing one of the operands. The lower
556/// 32 bits of this operand are used in the comparison.
557/// \returns A 128-bit vector of [4 x float] containing the comparison results
558/// in the low-order bits.
559static __inline__ __m128 __DEFAULT_FN_ATTRS
560_mm_cmplt_ss(__m128 __a, __m128 __b)
561{
562 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
563}
564
565/// Compares each of the corresponding 32-bit float values of the
566/// 128-bit vectors of [4 x float] to determine if the values in the first
567/// operand are less than those in the second operand.
568///
569/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
570/// If either value in a comparison is NaN, returns false.
571///
572/// \headerfile <x86intrin.h>
573///
574/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
575///
576/// \param __a
577/// A 128-bit vector of [4 x float].
578/// \param __b
579/// A 128-bit vector of [4 x float].
580/// \returns A 128-bit vector of [4 x float] containing the comparison results.
581static __inline__ __m128 __DEFAULT_FN_ATTRS
582_mm_cmplt_ps(__m128 __a, __m128 __b)
583{
584 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
585}
586
587/// Compares two 32-bit float values in the low-order bits of both
588/// operands to determine if the value in the first operand is less than or
589/// equal to the corresponding value in the second operand.
590///
591/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
592/// the low-order bits of a vector of [4 x float].
593/// If either value in a comparison is NaN, returns false.
594///
595/// \headerfile <x86intrin.h>
596///
597/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
598///
599/// \param __a
600/// A 128-bit vector of [4 x float] containing one of the operands. The lower
601/// 32 bits of this operand are used in the comparison.
602/// \param __b
603/// A 128-bit vector of [4 x float] containing one of the operands. The lower
604/// 32 bits of this operand are used in the comparison.
605/// \returns A 128-bit vector of [4 x float] containing the comparison results
606/// in the low-order bits.
607static __inline__ __m128 __DEFAULT_FN_ATTRS
608_mm_cmple_ss(__m128 __a, __m128 __b)
609{
610 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
611}
612
613/// Compares each of the corresponding 32-bit float values of the
614/// 128-bit vectors of [4 x float] to determine if the values in the first
615/// operand are less than or equal to those in the second operand.
616///
617/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
618/// If either value in a comparison is NaN, returns false.
619///
620/// \headerfile <x86intrin.h>
621///
622/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
623///
624/// \param __a
625/// A 128-bit vector of [4 x float].
626/// \param __b
627/// A 128-bit vector of [4 x float].
628/// \returns A 128-bit vector of [4 x float] containing the comparison results.
629static __inline__ __m128 __DEFAULT_FN_ATTRS
630_mm_cmple_ps(__m128 __a, __m128 __b)
631{
632 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
633}
634
635/// Compares two 32-bit float values in the low-order bits of both
636/// operands to determine if the value in the first operand is greater than
637/// the corresponding value in the second operand.
638///
639/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
640/// low-order bits of a vector of [4 x float].
641/// If either value in a comparison is NaN, returns false.
642///
643/// \headerfile <x86intrin.h>
644///
645/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
646///
647/// \param __a
648/// A 128-bit vector of [4 x float] containing one of the operands. The lower
649/// 32 bits of this operand are used in the comparison.
650/// \param __b
651/// A 128-bit vector of [4 x float] containing one of the operands. The lower
652/// 32 bits of this operand are used in the comparison.
653/// \returns A 128-bit vector of [4 x float] containing the comparison results
654/// in the low-order bits.
655static __inline__ __m128 __DEFAULT_FN_ATTRS
656_mm_cmpgt_ss(__m128 __a, __m128 __b)
657{
658 return (__m128)__builtin_shufflevector((__v4sf)__a,
659 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
660 4, 1, 2, 3);
661}
662
663/// Compares each of the corresponding 32-bit float values of the
664/// 128-bit vectors of [4 x float] to determine if the values in the first
665/// operand are greater than those in the second operand.
666///
667/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
668/// If either value in a comparison is NaN, returns false.
669///
670/// \headerfile <x86intrin.h>
671///
672/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
673///
674/// \param __a
675/// A 128-bit vector of [4 x float].
676/// \param __b
677/// A 128-bit vector of [4 x float].
678/// \returns A 128-bit vector of [4 x float] containing the comparison results.
679static __inline__ __m128 __DEFAULT_FN_ATTRS
680_mm_cmpgt_ps(__m128 __a, __m128 __b)
681{
682 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
683}
684
685/// Compares two 32-bit float values in the low-order bits of both
686/// operands to determine if the value in the first operand is greater than
687/// or equal to the corresponding value in the second operand.
688///
689/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
690/// low-order bits of a vector of [4 x float].
691/// If either value in a comparison is NaN, returns false.
692///
693/// \headerfile <x86intrin.h>
694///
695/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
696///
697/// \param __a
698/// A 128-bit vector of [4 x float] containing one of the operands. The lower
699/// 32 bits of this operand are used in the comparison.
700/// \param __b
701/// A 128-bit vector of [4 x float] containing one of the operands. The lower
702/// 32 bits of this operand are used in the comparison.
703/// \returns A 128-bit vector of [4 x float] containing the comparison results
704/// in the low-order bits.
705static __inline__ __m128 __DEFAULT_FN_ATTRS
706_mm_cmpge_ss(__m128 __a, __m128 __b)
707{
708 return (__m128)__builtin_shufflevector((__v4sf)__a,
709 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
710 4, 1, 2, 3);
711}
712
713/// Compares each of the corresponding 32-bit float values of the
714/// 128-bit vectors of [4 x float] to determine if the values in the first
715/// operand are greater than or equal to those in the second operand.
716///
717/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
718/// If either value in a comparison is NaN, returns false.
719///
720/// \headerfile <x86intrin.h>
721///
722/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
723///
724/// \param __a
725/// A 128-bit vector of [4 x float].
726/// \param __b
727/// A 128-bit vector of [4 x float].
728/// \returns A 128-bit vector of [4 x float] containing the comparison results.
729static __inline__ __m128 __DEFAULT_FN_ATTRS
730_mm_cmpge_ps(__m128 __a, __m128 __b)
731{
732 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
733}
734
735/// Compares two 32-bit float values in the low-order bits of both operands
736/// for inequality.
737///
738/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
739/// low-order bits of a vector of [4 x float].
740/// If either value in a comparison is NaN, returns true.
741///
742/// \headerfile <x86intrin.h>
743///
744/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
745/// instructions.
746///
747/// \param __a
748/// A 128-bit vector of [4 x float] containing one of the operands. The lower
749/// 32 bits of this operand are used in the comparison.
750/// \param __b
751/// A 128-bit vector of [4 x float] containing one of the operands. The lower
752/// 32 bits of this operand are used in the comparison.
753/// \returns A 128-bit vector of [4 x float] containing the comparison results
754/// in the low-order bits.
755static __inline__ __m128 __DEFAULT_FN_ATTRS
756_mm_cmpneq_ss(__m128 __a, __m128 __b)
757{
758 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
759}
760
761/// Compares each of the corresponding 32-bit float values of the
762/// 128-bit vectors of [4 x float] for inequality.
763///
764/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
765/// If either value in a comparison is NaN, returns true.
766///
767/// \headerfile <x86intrin.h>
768///
769/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
770/// instructions.
771///
772/// \param __a
773/// A 128-bit vector of [4 x float].
774/// \param __b
775/// A 128-bit vector of [4 x float].
776/// \returns A 128-bit vector of [4 x float] containing the comparison results.
777static __inline__ __m128 __DEFAULT_FN_ATTRS
778_mm_cmpneq_ps(__m128 __a, __m128 __b)
779{
780 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
781}
782
783/// Compares two 32-bit float values in the low-order bits of both
784/// operands to determine if the value in the first operand is not less than
785/// the corresponding value in the second operand.
786///
787/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
788/// low-order bits of a vector of [4 x float].
789/// If either value in a comparison is NaN, returns true.
790///
791/// \headerfile <x86intrin.h>
792///
793/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
794/// instructions.
795///
796/// \param __a
797/// A 128-bit vector of [4 x float] containing one of the operands. The lower
798/// 32 bits of this operand are used in the comparison.
799/// \param __b
800/// A 128-bit vector of [4 x float] containing one of the operands. The lower
801/// 32 bits of this operand are used in the comparison.
802/// \returns A 128-bit vector of [4 x float] containing the comparison results
803/// in the low-order bits.
804static __inline__ __m128 __DEFAULT_FN_ATTRS
805_mm_cmpnlt_ss(__m128 __a, __m128 __b)
806{
807 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
808}
809
810/// Compares each of the corresponding 32-bit float values of the
811/// 128-bit vectors of [4 x float] to determine if the values in the first
812/// operand are not less than those in the second operand.
813///
814/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
815/// If either value in a comparison is NaN, returns true.
816///
817/// \headerfile <x86intrin.h>
818///
819/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
820/// instructions.
821///
822/// \param __a
823/// A 128-bit vector of [4 x float].
824/// \param __b
825/// A 128-bit vector of [4 x float].
826/// \returns A 128-bit vector of [4 x float] containing the comparison results.
827static __inline__ __m128 __DEFAULT_FN_ATTRS
828_mm_cmpnlt_ps(__m128 __a, __m128 __b)
829{
830 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
831}
832
833/// Compares two 32-bit float values in the low-order bits of both
834/// operands to determine if the value in the first operand is not less than
835/// or equal to the corresponding value in the second operand.
836///
837/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
838/// low-order bits of a vector of [4 x float].
839/// If either value in a comparison is NaN, returns true.
840///
841/// \headerfile <x86intrin.h>
842///
843/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
844/// instructions.
845///
846/// \param __a
847/// A 128-bit vector of [4 x float] containing one of the operands. The lower
848/// 32 bits of this operand are used in the comparison.
849/// \param __b
850/// A 128-bit vector of [4 x float] containing one of the operands. The lower
851/// 32 bits of this operand are used in the comparison.
852/// \returns A 128-bit vector of [4 x float] containing the comparison results
853/// in the low-order bits.
854static __inline__ __m128 __DEFAULT_FN_ATTRS
855_mm_cmpnle_ss(__m128 __a, __m128 __b)
856{
857 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
858}
859
860/// Compares each of the corresponding 32-bit float values of the
861/// 128-bit vectors of [4 x float] to determine if the values in the first
862/// operand are not less than or equal to those in the second operand.
863///
864/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
865/// If either value in a comparison is NaN, returns true.
866///
867/// \headerfile <x86intrin.h>
868///
869/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
870/// instructions.
871///
872/// \param __a
873/// A 128-bit vector of [4 x float].
874/// \param __b
875/// A 128-bit vector of [4 x float].
876/// \returns A 128-bit vector of [4 x float] containing the comparison results.
877static __inline__ __m128 __DEFAULT_FN_ATTRS
878_mm_cmpnle_ps(__m128 __a, __m128 __b)
879{
880 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
881}
882
883/// Compares two 32-bit float values in the low-order bits of both
884/// operands to determine if the value in the first operand is not greater
885/// than the corresponding value in the second operand.
886///
887/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
888/// low-order bits of a vector of [4 x float].
889/// If either value in a comparison is NaN, returns true.
890///
891/// \headerfile <x86intrin.h>
892///
893/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
894/// instructions.
895///
896/// \param __a
897/// A 128-bit vector of [4 x float] containing one of the operands. The lower
898/// 32 bits of this operand are used in the comparison.
899/// \param __b
900/// A 128-bit vector of [4 x float] containing one of the operands. The lower
901/// 32 bits of this operand are used in the comparison.
902/// \returns A 128-bit vector of [4 x float] containing the comparison results
903/// in the low-order bits.
904static __inline__ __m128 __DEFAULT_FN_ATTRS
905_mm_cmpngt_ss(__m128 __a, __m128 __b)
906{
907 return (__m128)__builtin_shufflevector((__v4sf)__a,
908 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
909 4, 1, 2, 3);
910}
911
912/// Compares each of the corresponding 32-bit float values of the
913/// 128-bit vectors of [4 x float] to determine if the values in the first
914/// operand are not greater than those in the second operand.
915///
916/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
917/// If either value in a comparison is NaN, returns true.
918///
919/// \headerfile <x86intrin.h>
920///
921/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
922/// instructions.
923///
924/// \param __a
925/// A 128-bit vector of [4 x float].
926/// \param __b
927/// A 128-bit vector of [4 x float].
928/// \returns A 128-bit vector of [4 x float] containing the comparison results.
929static __inline__ __m128 __DEFAULT_FN_ATTRS
930_mm_cmpngt_ps(__m128 __a, __m128 __b)
931{
932 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
933}
934
935/// Compares two 32-bit float values in the low-order bits of both
936/// operands to determine if the value in the first operand is not greater
937/// than or equal to the corresponding value in the second operand.
938///
939/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
940/// low-order bits of a vector of [4 x float].
941/// If either value in a comparison is NaN, returns true.
942///
943/// \headerfile <x86intrin.h>
944///
945/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
946/// instructions.
947///
948/// \param __a
949/// A 128-bit vector of [4 x float] containing one of the operands. The lower
950/// 32 bits of this operand are used in the comparison.
951/// \param __b
952/// A 128-bit vector of [4 x float] containing one of the operands. The lower
953/// 32 bits of this operand are used in the comparison.
954/// \returns A 128-bit vector of [4 x float] containing the comparison results
955/// in the low-order bits.
956static __inline__ __m128 __DEFAULT_FN_ATTRS
957_mm_cmpnge_ss(__m128 __a, __m128 __b)
958{
959 return (__m128)__builtin_shufflevector((__v4sf)__a,
960 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
961 4, 1, 2, 3);
962}
963
964/// Compares each of the corresponding 32-bit float values of the
965/// 128-bit vectors of [4 x float] to determine if the values in the first
966/// operand are not greater than or equal to those in the second operand.
967///
968/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
969/// If either value in a comparison is NaN, returns true.
970///
971/// \headerfile <x86intrin.h>
972///
973/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
974/// instructions.
975///
976/// \param __a
977/// A 128-bit vector of [4 x float].
978/// \param __b
979/// A 128-bit vector of [4 x float].
980/// \returns A 128-bit vector of [4 x float] containing the comparison results.
981static __inline__ __m128 __DEFAULT_FN_ATTRS
982_mm_cmpnge_ps(__m128 __a, __m128 __b)
983{
984 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
985}
986
987/// Compares two 32-bit float values in the low-order bits of both
988/// operands to determine if the value in the first operand is ordered with
989/// respect to the corresponding value in the second operand.
990///
991/// A pair of floating-point values are ordered with respect to each
992/// other if neither value is a NaN. Each comparison returns 0x0 for false,
993/// 0xFFFFFFFF for true.
994///
995/// \headerfile <x86intrin.h>
996///
997/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
998/// instructions.
999///
1000/// \param __a
1001/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1002/// 32 bits of this operand are used in the comparison.
1003/// \param __b
1004/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1005/// 32 bits of this operand are used in the comparison.
1006/// \returns A 128-bit vector of [4 x float] containing the comparison results
1007/// in the low-order bits.
1008static __inline__ __m128 __DEFAULT_FN_ATTRS
1009_mm_cmpord_ss(__m128 __a, __m128 __b)
1010{
1011 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1012}
1013
1014/// Compares each of the corresponding 32-bit float values of the
1015/// 128-bit vectors of [4 x float] to determine if the values in the first
1016/// operand are ordered with respect to those in the second operand.
1017///
1018/// A pair of floating-point values are ordered with respect to each
1019/// other if neither value is a NaN. Each comparison returns 0x0 for false,
1020/// 0xFFFFFFFF for true.
1021///
1022/// \headerfile <x86intrin.h>
1023///
1024/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1025/// instructions.
1026///
1027/// \param __a
1028/// A 128-bit vector of [4 x float].
1029/// \param __b
1030/// A 128-bit vector of [4 x float].
1031/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1032static __inline__ __m128 __DEFAULT_FN_ATTRS
1033_mm_cmpord_ps(__m128 __a, __m128 __b)
1034{
1035 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1036}
1037
1038/// Compares two 32-bit float values in the low-order bits of both
1039/// operands to determine if the value in the first operand is unordered
1040/// with respect to the corresponding value in the second operand.
1041///
1042/// A pair of double-precision values are unordered with respect to each
1043/// other if one or both values are NaN. Each comparison returns 0x0 for
1044/// false, 0xFFFFFFFF for true.
1045///
1046/// \headerfile <x86intrin.h>
1047///
1048/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1049/// instructions.
1050///
1051/// \param __a
1052/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1053/// 32 bits of this operand are used in the comparison.
1054/// \param __b
1055/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1056/// 32 bits of this operand are used in the comparison.
1057/// \returns A 128-bit vector of [4 x float] containing the comparison results
1058/// in the low-order bits.
1059static __inline__ __m128 __DEFAULT_FN_ATTRS
1060_mm_cmpunord_ss(__m128 __a, __m128 __b)
1061{
1062 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1063}
1064
1065/// Compares each of the corresponding 32-bit float values of the
1066/// 128-bit vectors of [4 x float] to determine if the values in the first
1067/// operand are unordered with respect to those in the second operand.
1068///
1069/// A pair of double-precision values are unordered with respect to each
1070/// other if one or both values are NaN. Each comparison returns 0x0 for
1071/// false, 0xFFFFFFFFFFFFFFFF for true.
1072///
1073/// \headerfile <x86intrin.h>
1074///
1075/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1076/// instructions.
1077///
1078/// \param __a
1079/// A 128-bit vector of [4 x float].
1080/// \param __b
1081/// A 128-bit vector of [4 x float].
1082/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1083static __inline__ __m128 __DEFAULT_FN_ATTRS
1084_mm_cmpunord_ps(__m128 __a, __m128 __b)
1085{
1086 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1087}
1088
1089/// Compares two 32-bit float values in the low-order bits of both
1090/// operands for equality.
1091///
1092/// The comparison returns 0 for false, 1 for true. If either value in a
1093/// comparison is NaN, returns 0.
1094///
1095/// \headerfile <x86intrin.h>
1096///
1097/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1098/// instructions.
1099///
1100/// \param __a
1101/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1102/// used in the comparison.
1103/// \param __b
1104/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1105/// used in the comparison.
1106/// \returns An integer containing the comparison results.
1107static __inline__ int __DEFAULT_FN_ATTRS
1108_mm_comieq_ss(__m128 __a, __m128 __b)
1109{
1110 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1111}
1112
1113/// Compares two 32-bit float values in the low-order bits of both
1114/// operands to determine if the first operand is less than the second
1115/// operand.
1116///
1117/// The comparison returns 0 for false, 1 for true. If either value in a
1118/// comparison is NaN, returns 0.
1119///
1120/// \headerfile <x86intrin.h>
1121///
1122/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1123/// instructions.
1124///
1125/// \param __a
1126/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1127/// used in the comparison.
1128/// \param __b
1129/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130/// used in the comparison.
1131/// \returns An integer containing the comparison results.
1132static __inline__ int __DEFAULT_FN_ATTRS
1133_mm_comilt_ss(__m128 __a, __m128 __b)
1134{
1135 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1136}
1137
1138/// Compares two 32-bit float values in the low-order bits of both
1139/// operands to determine if the first operand is less than or equal to the
1140/// second operand.
1141///
1142/// The comparison returns 0 for false, 1 for true. If either value in a
1143/// comparison is NaN, returns 0.
1144///
1145/// \headerfile <x86intrin.h>
1146///
1147/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1148///
1149/// \param __a
1150/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1151/// used in the comparison.
1152/// \param __b
1153/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1154/// used in the comparison.
1155/// \returns An integer containing the comparison results.
1156static __inline__ int __DEFAULT_FN_ATTRS
1157_mm_comile_ss(__m128 __a, __m128 __b)
1158{
1159 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1160}
1161
1162/// Compares two 32-bit float values in the low-order bits of both
1163/// operands to determine if the first operand is greater than the second
1164/// operand.
1165///
1166/// The comparison returns 0 for false, 1 for true. If either value in a
1167/// comparison is NaN, returns 0.
1168///
1169/// \headerfile <x86intrin.h>
1170///
1171/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1172///
1173/// \param __a
1174/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1175/// used in the comparison.
1176/// \param __b
1177/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178/// used in the comparison.
1179/// \returns An integer containing the comparison results.
1180static __inline__ int __DEFAULT_FN_ATTRS
1181_mm_comigt_ss(__m128 __a, __m128 __b)
1182{
1183 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1184}
1185
1186/// Compares two 32-bit float values in the low-order bits of both
1187/// operands to determine if the first operand is greater than or equal to
1188/// the second operand.
1189///
1190/// The comparison returns 0 for false, 1 for true. If either value in a
1191/// comparison is NaN, returns 0.
1192///
1193/// \headerfile <x86intrin.h>
1194///
1195/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1196///
1197/// \param __a
1198/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1199/// used in the comparison.
1200/// \param __b
1201/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1202/// used in the comparison.
1203/// \returns An integer containing the comparison results.
1204static __inline__ int __DEFAULT_FN_ATTRS
1205_mm_comige_ss(__m128 __a, __m128 __b)
1206{
1207 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1208}
1209
1210/// Compares two 32-bit float values in the low-order bits of both
1211/// operands to determine if the first operand is not equal to the second
1212/// operand.
1213///
1214/// The comparison returns 0 for false, 1 for true. If either value in a
1215/// comparison is NaN, returns 1.
1216///
1217/// \headerfile <x86intrin.h>
1218///
1219/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1220///
1221/// \param __a
1222/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1223/// used in the comparison.
1224/// \param __b
1225/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1226/// used in the comparison.
1227/// \returns An integer containing the comparison results.
1228static __inline__ int __DEFAULT_FN_ATTRS
1229_mm_comineq_ss(__m128 __a, __m128 __b)
1230{
1231 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1232}
1233
1234/// Performs an unordered comparison of two 32-bit float values using
1235/// the low-order bits of both operands to determine equality.
1236///
1237/// The comparison returns 0 for false, 1 for true. If either value in a
1238/// comparison is NaN, returns 0.
1239///
1240/// \headerfile <x86intrin.h>
1241///
1242/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1243///
1244/// \param __a
1245/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1246/// used in the comparison.
1247/// \param __b
1248/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1249/// used in the comparison.
1250/// \returns An integer containing the comparison results.
1251static __inline__ int __DEFAULT_FN_ATTRS
1252_mm_ucomieq_ss(__m128 __a, __m128 __b)
1253{
1254 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1255}
1256
1257/// Performs an unordered comparison of two 32-bit float values using
1258/// the low-order bits of both operands to determine if the first operand is
1259/// less than the second operand.
1260///
1261/// The comparison returns 0 for false, 1 for true. If either value in a
1262/// comparison is NaN, returns 0.
1263///
1264/// \headerfile <x86intrin.h>
1265///
1266/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1267///
1268/// \param __a
1269/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1270/// used in the comparison.
1271/// \param __b
1272/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1273/// used in the comparison.
1274/// \returns An integer containing the comparison results.
1275static __inline__ int __DEFAULT_FN_ATTRS
1276_mm_ucomilt_ss(__m128 __a, __m128 __b)
1277{
1278 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1279}
1280
1281/// Performs an unordered comparison of two 32-bit float values using
1282/// the low-order bits of both operands to determine if the first operand is
1283/// less than or equal to the second operand.
1284///
1285/// The comparison returns 0 for false, 1 for true. If either value in a
1286/// comparison is NaN, returns 0.
1287///
1288/// \headerfile <x86intrin.h>
1289///
1290/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1291///
1292/// \param __a
1293/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1294/// used in the comparison.
1295/// \param __b
1296/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1297/// used in the comparison.
1298/// \returns An integer containing the comparison results.
1299static __inline__ int __DEFAULT_FN_ATTRS
1300_mm_ucomile_ss(__m128 __a, __m128 __b)
1301{
1302 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1303}
1304
1305/// Performs an unordered comparison of two 32-bit float values using
1306/// the low-order bits of both operands to determine if the first operand is
1307/// greater than the second operand.
1308///
1309/// The comparison returns 0 for false, 1 for true. If either value in a
1310/// comparison is NaN, returns 0.
1311///
1312/// \headerfile <x86intrin.h>
1313///
1314/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1315///
1316/// \param __a
1317/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1318/// used in the comparison.
1319/// \param __b
1320/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1321/// used in the comparison.
1322/// \returns An integer containing the comparison results.
1323static __inline__ int __DEFAULT_FN_ATTRS
1324_mm_ucomigt_ss(__m128 __a, __m128 __b)
1325{
1326 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1327}
1328
1329/// Performs an unordered comparison of two 32-bit float values using
1330/// the low-order bits of both operands to determine if the first operand is
1331/// greater than or equal to the second operand.
1332///
1333/// The comparison returns 0 for false, 1 for true. If either value in a
1334/// comparison is NaN, returns 0.
1335///
1336/// \headerfile <x86intrin.h>
1337///
1338/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1339///
1340/// \param __a
1341/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1342/// used in the comparison.
1343/// \param __b
1344/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1345/// used in the comparison.
1346/// \returns An integer containing the comparison results.
1347static __inline__ int __DEFAULT_FN_ATTRS
1348_mm_ucomige_ss(__m128 __a, __m128 __b)
1349{
1350 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1351}
1352
1353/// Performs an unordered comparison of two 32-bit float values using
1354/// the low-order bits of both operands to determine inequality.
1355///
1356/// The comparison returns 0 for false, 1 for true. If either value in a
1357/// comparison is NaN, returns 0.
1358///
1359/// \headerfile <x86intrin.h>
1360///
1361/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1362///
1363/// \param __a
1364/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1365/// used in the comparison.
1366/// \param __b
1367/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1368/// used in the comparison.
1369/// \returns An integer containing the comparison results.
1370static __inline__ int __DEFAULT_FN_ATTRS
1371_mm_ucomineq_ss(__m128 __a, __m128 __b)
1372{
1373 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1374}
1375
1376/// Converts a float value contained in the lower 32 bits of a vector of
1377/// [4 x float] into a 32-bit integer.
1378///
1379/// If the converted value does not fit in a 32-bit integer, raises a
1380/// floating-point invalid exception. If the exception is masked, returns
1381/// the most negative integer.
1382///
1383/// \headerfile <x86intrin.h>
1384///
1385/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1386/// instructions.
1387///
1388/// \param __a
1389/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1390/// used in the conversion.
1391/// \returns A 32-bit integer containing the converted value.
1392static __inline__ int __DEFAULT_FN_ATTRS
1394{
1395 return __builtin_ia32_cvtss2si((__v4sf)__a);
1396}
1397
1398/// Converts a float value contained in the lower 32 bits of a vector of
1399/// [4 x float] into a 32-bit integer.
1400///
1401/// If the converted value does not fit in a 32-bit integer, raises a
1402/// floating-point invalid exception. If the exception is masked, returns
1403/// the most negative integer.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1408/// instructions.
1409///
1410/// \param __a
1411/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1412/// used in the conversion.
1413/// \returns A 32-bit integer containing the converted value.
1414static __inline__ int __DEFAULT_FN_ATTRS
1416{
1417 return _mm_cvtss_si32(__a);
1418}
1419
1420#ifdef __x86_64__
1421
1422/// Converts a float value contained in the lower 32 bits of a vector of
1423/// [4 x float] into a 64-bit integer.
1424///
1425/// If the converted value does not fit in a 32-bit integer, raises a
1426/// floating-point invalid exception. If the exception is masked, returns
1427/// the most negative integer.
1428///
1429/// \headerfile <x86intrin.h>
1430///
1431/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1432/// instructions.
1433///
1434/// \param __a
1435/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1436/// used in the conversion.
1437/// \returns A 64-bit integer containing the converted value.
1438static __inline__ long long __DEFAULT_FN_ATTRS
1439_mm_cvtss_si64(__m128 __a)
1440{
1441 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1442}
1443
1444#endif
1445
1446/// Converts two low-order float values in a 128-bit vector of
1447/// [4 x float] into a 64-bit vector of [2 x i32].
1448///
1449/// If a converted value does not fit in a 32-bit integer, raises a
1450/// floating-point invalid exception. If the exception is masked, returns
1451/// the most negative integer.
1452///
1453/// \headerfile <x86intrin.h>
1454///
1455/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1456///
1457/// \param __a
1458/// A 128-bit vector of [4 x float].
1459/// \returns A 64-bit integer vector containing the converted values.
1460static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1462{
1463 return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
1464}
1465
1466/// Converts two low-order float values in a 128-bit vector of
1467/// [4 x float] into a 64-bit vector of [2 x i32].
1468///
1469/// If a converted value does not fit in a 32-bit integer, raises a
1470/// floating-point invalid exception. If the exception is masked, returns
1471/// the most negative integer.
1472///
1473/// \headerfile <x86intrin.h>
1474///
1475/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1476///
1477/// \param __a
1478/// A 128-bit vector of [4 x float].
1479/// \returns A 64-bit integer vector containing the converted values.
1480static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1482{
1483 return _mm_cvtps_pi32(__a);
1484}
1485
1486/// Converts the lower (first) element of a vector of [4 x float] into a signed
1487/// truncated (rounded toward zero) 32-bit integer.
1488///
1489/// If the converted value does not fit in a 32-bit integer, raises a
1490/// floating-point invalid exception. If the exception is masked, returns
1491/// the most negative integer.
1492///
1493/// \headerfile <x86intrin.h>
1494///
1495/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1496/// instructions.
1497///
1498/// \param __a
1499/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1500/// used in the conversion.
1501/// \returns A 32-bit integer containing the converted value.
1502static __inline__ int __DEFAULT_FN_ATTRS
1504{
1505 return __builtin_ia32_cvttss2si((__v4sf)__a);
1506}
1507
1508/// Converts the lower (first) element of a vector of [4 x float] into a signed
1509/// truncated (rounded toward zero) 32-bit integer.
1510///
1511/// If the converted value does not fit in a 32-bit integer, raises a
1512/// floating-point invalid exception. If the exception is masked, returns
1513/// the most negative integer.
1514///
1515/// \headerfile <x86intrin.h>
1516///
1517/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1518/// instructions.
1519///
1520/// \param __a
1521/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1522/// used in the conversion.
1523/// \returns A 32-bit integer containing the converted value.
1524static __inline__ int __DEFAULT_FN_ATTRS
1526{
1527 return _mm_cvttss_si32(__a);
1528}
1529
1530#ifdef __x86_64__
1531/// Converts the lower (first) element of a vector of [4 x float] into a signed
1532/// truncated (rounded toward zero) 64-bit integer.
1533///
1534/// If the converted value does not fit in a 64-bit integer, raises a
1535/// floating-point invalid exception. If the exception is masked, returns
1536/// the most negative integer.
1537///
1538/// \headerfile <x86intrin.h>
1539///
1540/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1541/// instructions.
1542///
1543/// \param __a
1544/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1545/// used in the conversion.
1546/// \returns A 64-bit integer containing the converted value.
1547static __inline__ long long __DEFAULT_FN_ATTRS
1548_mm_cvttss_si64(__m128 __a)
1549{
1550 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1551}
1552#endif
1553
1554/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1555/// into two signed truncated (rounded toward zero) 32-bit integers,
1556/// returned in a 64-bit vector of [2 x i32].
1557///
1558/// If a converted value does not fit in a 32-bit integer, raises a
1559/// floating-point invalid exception. If the exception is masked, returns
1560/// the most negative integer.
1561///
1562/// \headerfile <x86intrin.h>
1563///
1564/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1565/// instructions.
1566///
1567/// \param __a
1568/// A 128-bit vector of [4 x float].
1569/// \returns A 64-bit integer vector containing the converted values.
1570static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1572{
1573 return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
1574}
1575
1576/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1577/// into two signed truncated (rounded toward zero) 64-bit integers,
1578/// returned in a 64-bit vector of [2 x i32].
1579///
1580/// If a converted value does not fit in a 32-bit integer, raises a
1581/// floating-point invalid exception. If the exception is masked, returns
1582/// the most negative integer.
1583///
1584/// \headerfile <x86intrin.h>
1585///
1586/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1587///
1588/// \param __a
1589/// A 128-bit vector of [4 x float].
1590/// \returns A 64-bit integer vector containing the converted values.
1591static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1593{
1594 return _mm_cvttps_pi32(__a);
1595}
1596
1597/// Converts a 32-bit signed integer value into a floating point value
1598/// and writes it to the lower 32 bits of the destination. The remaining
1599/// higher order elements of the destination vector are copied from the
1600/// corresponding elements in the first operand.
1601///
1602/// \headerfile <x86intrin.h>
1603///
1604/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1605///
1606/// \param __a
1607/// A 128-bit vector of [4 x float].
1608/// \param __b
1609/// A 32-bit signed integer operand containing the value to be converted.
1610/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1611/// converted value of the second operand. The upper 96 bits are copied from
1612/// the upper 96 bits of the first operand.
1613static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
1614 int __b) {
1615 __a[0] = __b;
1616 return __a;
1617}
1618
1619/// Converts a 32-bit signed integer value into a floating point value
1620/// and writes it to the lower 32 bits of the destination. The remaining
1621/// higher order elements of the destination are copied from the
1622/// corresponding elements in the first operand.
1623///
1624/// \headerfile <x86intrin.h>
1625///
1626/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1627///
1628/// \param __a
1629/// A 128-bit vector of [4 x float].
1630/// \param __b
1631/// A 32-bit signed integer operand containing the value to be converted.
1632/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1633/// converted value of the second operand. The upper 96 bits are copied from
1634/// the upper 96 bits of the first operand.
1635static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
1636 int __b) {
1637 return _mm_cvtsi32_ss(__a, __b);
1638}
1639
1640#ifdef __x86_64__
1641
1642/// Converts a 64-bit signed integer value into a floating point value
1643/// and writes it to the lower 32 bits of the destination. The remaining
1644/// higher order elements of the destination are copied from the
1645/// corresponding elements in the first operand.
1646///
1647/// \headerfile <x86intrin.h>
1648///
1649/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1650///
1651/// \param __a
1652/// A 128-bit vector of [4 x float].
1653/// \param __b
1654/// A 64-bit signed integer operand containing the value to be converted.
1655/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1656/// converted value of the second operand. The upper 96 bits are copied from
1657/// the upper 96 bits of the first operand.
1658static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1659_mm_cvtsi64_ss(__m128 __a, long long __b) {
1660 __a[0] = __b;
1661 return __a;
1662}
1663
1664#endif
1665
1666/// Converts two elements of a 64-bit vector of [2 x i32] into two
1667/// floating point values and writes them to the lower 64-bits of the
1668/// destination. The remaining higher order elements of the destination are
1669/// copied from the corresponding elements in the first operand.
1670///
1671/// \headerfile <x86intrin.h>
1672///
1673/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1674///
1675/// \param __a
1676/// A 128-bit vector of [4 x float].
1677/// \param __b
1678/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1679/// and written to the corresponding low-order elements in the destination.
1680/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1681/// converted value of the second operand. The upper 64 bits are copied from
1682/// the upper 64 bits of the first operand.
1683static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1684_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1685{
1686 return (__m128)__builtin_shufflevector(
1687 (__v4sf)__a,
1688 __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
1689 4, 5, 2, 3);
1690}
1691
1692/// Converts two elements of a 64-bit vector of [2 x i32] into two
1693/// floating point values and writes them to the lower 64-bits of the
1694/// destination. The remaining higher order elements of the destination are
1695/// copied from the corresponding elements in the first operand.
1696///
1697/// \headerfile <x86intrin.h>
1698///
1699/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1700///
1701/// \param __a
1702/// A 128-bit vector of [4 x float].
1703/// \param __b
1704/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1705/// and written to the corresponding low-order elements in the destination.
1706/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1707/// converted value from the second operand. The upper 64 bits are copied
1708/// from the upper 64 bits of the first operand.
1709static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1710_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1711{
1712 return _mm_cvtpi32_ps(__a, __b);
1713}
1714
1715/// Extracts a float value contained in the lower 32 bits of a vector of
1716/// [4 x float].
1717///
1718/// \headerfile <x86intrin.h>
1719///
1720/// This intrinsic has no corresponding instruction.
1721///
1722/// \param __a
1723/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1724/// used in the extraction.
1725/// \returns A 32-bit float containing the extracted value.
1726static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
1728 return __a[0];
1729}
1730
1731/// Loads two packed float values from the address \a __p into the
1732/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1733/// are copied from the low-order bits of the first operand.
1734///
1735/// \headerfile <x86intrin.h>
1736///
1737/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1738///
1739/// \param __a
1740/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1741/// of the destination.
1742/// \param __p
1743/// A pointer to two packed float values. Bits [63:0] are written to bits
1744/// [127:64] of the destination.
1745/// \returns A 128-bit vector of [4 x float] containing the moved values.
1746static __inline__ __m128 __DEFAULT_FN_ATTRS
1747_mm_loadh_pi(__m128 __a, const __m64 *__p)
1748{
1749 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1750 struct __mm_loadh_pi_struct {
1751 __mm_loadh_pi_v2f32 __u;
1752 } __attribute__((__packed__, __may_alias__));
1753 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1754 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1755 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1756}
1757
1758/// Loads two packed float values from the address \a __p into the
1759/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1760/// are copied from the high-order bits of the first operand.
1761///
1762/// \headerfile <x86intrin.h>
1763///
1764/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1765///
1766/// \param __a
1767/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1768/// [127:64] of the destination.
1769/// \param __p
1770/// A pointer to two packed float values. Bits [63:0] are written to bits
1771/// [63:0] of the destination.
1772/// \returns A 128-bit vector of [4 x float] containing the moved values.
1773static __inline__ __m128 __DEFAULT_FN_ATTRS
1774_mm_loadl_pi(__m128 __a, const __m64 *__p)
1775{
1776 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1777 struct __mm_loadl_pi_struct {
1778 __mm_loadl_pi_v2f32 __u;
1779 } __attribute__((__packed__, __may_alias__));
1780 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1781 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1782 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1783}
1784
1785/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1786/// 32 bits of the vector are initialized with the single-precision
1787/// floating-point value loaded from a specified memory location. The upper
1788/// 96 bits are set to zero.
1789///
1790/// \headerfile <x86intrin.h>
1791///
1792/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1793///
1794/// \param __p
1795/// A pointer to a 32-bit memory location containing a single-precision
1796/// floating-point value.
1797/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1798/// lower 32 bits contain the value loaded from the memory location. The
1799/// upper 96 bits are set to zero.
1800static __inline__ __m128 __DEFAULT_FN_ATTRS
1801_mm_load_ss(const float *__p)
1802{
1803 struct __mm_load_ss_struct {
1804 float __u;
1805 } __attribute__((__packed__, __may_alias__));
1806 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1807 return __extension__ (__m128){ __u, 0, 0, 0 };
1808}
1809
1810/// Loads a 32-bit float value and duplicates it to all four vector
1811/// elements of a 128-bit vector of [4 x float].
1812///
1813/// \headerfile <x86intrin.h>
1814///
1815/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1816/// instruction.
1817///
1818/// \param __p
1819/// A pointer to a float value to be loaded and duplicated.
1820/// \returns A 128-bit vector of [4 x float] containing the loaded and
1821/// duplicated values.
1822static __inline__ __m128 __DEFAULT_FN_ATTRS
1823_mm_load1_ps(const float *__p)
1824{
1825 struct __mm_load1_ps_struct {
1826 float __u;
1827 } __attribute__((__packed__, __may_alias__));
1828 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1829 return __extension__ (__m128){ __u, __u, __u, __u };
1830}
1831
/* Alternate spelling of _mm_load1_ps; the argument is forwarded unchanged. */
#define _mm_load_ps1(ptr) _mm_load1_ps(ptr)
1833
1834/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1835/// memory location.
1836///
1837/// \headerfile <x86intrin.h>
1838///
1839/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1840///
1841/// \param __p
1842/// A pointer to a 128-bit memory location. The address of the memory
1843/// location has to be 128-bit aligned.
1844/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1845static __inline__ __m128 __DEFAULT_FN_ATTRS
1846_mm_load_ps(const float *__p)
1847{
1848 return *(const __m128*)__p;
1849}
1850
1851/// Loads a 128-bit floating-point vector of [4 x float] from an
1852/// unaligned memory location.
1853///
1854/// \headerfile <x86intrin.h>
1855///
1856/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1857///
1858/// \param __p
1859/// A pointer to a 128-bit memory location. The address of the memory
1860/// location does not have to be aligned.
1861/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1862static __inline__ __m128 __DEFAULT_FN_ATTRS
1863_mm_loadu_ps(const float *__p)
1864{
1865 struct __loadu_ps {
1866 __m128_u __v;
1867 } __attribute__((__packed__, __may_alias__));
1868 return ((const struct __loadu_ps*)__p)->__v;
1869}
1870
1871/// Loads four packed float values, in reverse order, from an aligned
1872/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1873///
1874/// \headerfile <x86intrin.h>
1875///
1876/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1877/// instruction.
1878///
1879/// \param __p
1880/// A pointer to a 128-bit memory location. The address of the memory
1881/// location has to be 128-bit aligned.
1882/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1883/// in reverse order.
1884static __inline__ __m128 __DEFAULT_FN_ATTRS
1885_mm_loadr_ps(const float *__p)
1886{
1887 __m128 __a = _mm_load_ps(__p);
1888 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1889}
1890
1891/// Create a 128-bit vector of [4 x float] with undefined values.
1892///
1893/// \headerfile <x86intrin.h>
1894///
1895/// This intrinsic has no corresponding instruction.
1896///
1897/// \returns A 128-bit vector of [4 x float] containing undefined values.
1898static __inline__ __m128 __DEFAULT_FN_ATTRS
1900{
1901 return (__m128)__builtin_ia32_undef128();
1902}
1903
1904/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1905/// 32 bits of the vector are initialized with the specified single-precision
1906/// floating-point value. The upper 96 bits are set to zero.
1907///
1908/// \headerfile <x86intrin.h>
1909///
1910/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1911///
1912/// \param __w
1913/// A single-precision floating-point value used to initialize the lower 32
1914/// bits of the result.
1915/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1916/// lower 32 bits contain the value provided in the source operand. The
1917/// upper 96 bits are set to zero.
1918static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1919_mm_set_ss(float __w) {
1920 return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
1921}
1922
1923/// Constructs a 128-bit floating-point vector of [4 x float], with each
1924/// of the four single-precision floating-point vector elements set to the
1925/// specified single-precision floating-point value.
1926///
1927/// \headerfile <x86intrin.h>
1928///
1929/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1930///
1931/// \param __w
1932/// A single-precision floating-point value used to initialize each vector
1933/// element of the result.
1934/// \returns An initialized 128-bit floating-point vector of [4 x float].
1935static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1936_mm_set1_ps(float __w) {
1937 return __extension__ (__m128){ __w, __w, __w, __w };
1938}
1939
1940/* Microsoft specific. */
1941/// Constructs a 128-bit floating-point vector of [4 x float], with each
1942/// of the four single-precision floating-point vector elements set to the
1943/// specified single-precision floating-point value.
1944///
1945/// \headerfile <x86intrin.h>
1946///
1947/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1948///
1949/// \param __w
1950/// A single-precision floating-point value used to initialize each vector
1951/// element of the result.
1952/// \returns An initialized 128-bit floating-point vector of [4 x float].
1953static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1954_mm_set_ps1(float __w) {
1955 return _mm_set1_ps(__w);
1956}
1957
1958/// Constructs a 128-bit floating-point vector of [4 x float]
1959/// initialized with the specified single-precision floating-point values.
1960///
1961/// \headerfile <x86intrin.h>
1962///
1963/// This intrinsic is a utility function and does not correspond to a specific
1964/// instruction.
1965///
1966/// \param __z
1967/// A single-precision floating-point value used to initialize bits [127:96]
1968/// of the result.
1969/// \param __y
1970/// A single-precision floating-point value used to initialize bits [95:64]
1971/// of the result.
1972/// \param __x
1973/// A single-precision floating-point value used to initialize bits [63:32]
1974/// of the result.
1975/// \param __w
1976/// A single-precision floating-point value used to initialize bits [31:0]
1977/// of the result.
1978/// \returns An initialized 128-bit floating-point vector of [4 x float].
1979static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1980_mm_set_ps(float __z, float __y, float __x, float __w) {
1981 return __extension__ (__m128){ __w, __x, __y, __z };
1982}
1983
1984/// Constructs a 128-bit floating-point vector of [4 x float],
1985/// initialized in reverse order with the specified 32-bit single-precision
1986/// float-point values.
1987///
1988/// \headerfile <x86intrin.h>
1989///
1990/// This intrinsic is a utility function and does not correspond to a specific
1991/// instruction.
1992///
1993/// \param __z
1994/// A single-precision floating-point value used to initialize bits [31:0]
1995/// of the result.
1996/// \param __y
1997/// A single-precision floating-point value used to initialize bits [63:32]
1998/// of the result.
1999/// \param __x
2000/// A single-precision floating-point value used to initialize bits [95:64]
2001/// of the result.
2002/// \param __w
2003/// A single-precision floating-point value used to initialize bits [127:96]
2004/// of the result.
2005/// \returns An initialized 128-bit floating-point vector of [4 x float].
2006static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2007_mm_setr_ps(float __z, float __y, float __x, float __w) {
2008 return __extension__ (__m128){ __z, __y, __x, __w };
2009}
2010
2011/// Constructs a 128-bit floating-point vector of [4 x float] initialized
2012/// to zero.
2013///
2014/// \headerfile <x86intrin.h>
2015///
2016/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2017///
2018/// \returns An initialized 128-bit floating-point vector of [4 x float] with
2019/// all elements set to zero.
2020static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2022 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2023}
2024
2025/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2026/// memory location.
2027///
2028/// \headerfile <x86intrin.h>
2029///
2030/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
2031///
2032/// \param __p
2033/// A pointer to a 64-bit memory location.
2034/// \param __a
2035/// A 128-bit vector of [4 x float] containing the values to be stored.
2036static __inline__ void __DEFAULT_FN_ATTRS
2037_mm_storeh_pi(__m64 *__p, __m128 __a)
2038{
2039 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2040 struct __mm_storeh_pi_struct {
2041 __mm_storeh_pi_v2f32 __u;
2042 } __attribute__((__packed__, __may_alias__));
2043 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2044}
2045
2046/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2047/// memory location.
2048///
2049/// \headerfile <x86intrin.h>
2050///
2051/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2052///
2053/// \param __p
2054/// A pointer to a memory location that will receive the float values.
2055/// \param __a
2056/// A 128-bit vector of [4 x float] containing the values to be stored.
2057static __inline__ void __DEFAULT_FN_ATTRS
2058_mm_storel_pi(__m64 *__p, __m128 __a)
2059{
2060 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2061 struct __mm_storeh_pi_struct {
2062 __mm_storeh_pi_v2f32 __u;
2063 } __attribute__((__packed__, __may_alias__));
2064 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2065}
2066
2067/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2068/// memory location.
2069///
2070/// \headerfile <x86intrin.h>
2071///
2072/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2073///
2074/// \param __p
2075/// A pointer to a 32-bit memory location.
2076/// \param __a
2077/// A 128-bit vector of [4 x float] containing the value to be stored.
2078static __inline__ void __DEFAULT_FN_ATTRS
2079_mm_store_ss(float *__p, __m128 __a)
2080{
2081 struct __mm_store_ss_struct {
2082 float __u;
2083 } __attribute__((__packed__, __may_alias__));
2084 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2085}
2086
2087/// Stores a 128-bit vector of [4 x float] to an unaligned memory
2088/// location.
2089///
2090/// \headerfile <x86intrin.h>
2091///
2092/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2093///
2094/// \param __p
2095/// A pointer to a 128-bit memory location. The address of the memory
2096/// location does not have to be aligned.
2097/// \param __a
2098/// A 128-bit vector of [4 x float] containing the values to be stored.
2099static __inline__ void __DEFAULT_FN_ATTRS
2100_mm_storeu_ps(float *__p, __m128 __a)
2101{
2102 struct __storeu_ps {
2103 __m128_u __v;
2104 } __attribute__((__packed__, __may_alias__));
2105 ((struct __storeu_ps*)__p)->__v = __a;
2106}
2107
2108/// Stores a 128-bit vector of [4 x float] into an aligned memory
2109/// location.
2110///
2111/// \headerfile <x86intrin.h>
2112///
2113/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2114///
2115/// \param __p
2116/// A pointer to a 128-bit memory location. The address of the memory
2117/// location has to be 16-byte aligned.
2118/// \param __a
2119/// A 128-bit vector of [4 x float] containing the values to be stored.
2120static __inline__ void __DEFAULT_FN_ATTRS
2121_mm_store_ps(float *__p, __m128 __a)
2122{
2123 *(__m128*)__p = __a;
2124}
2125
2126/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2127/// four contiguous elements in an aligned memory location.
2128///
2129/// \headerfile <x86intrin.h>
2130///
2131/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2132/// instruction.
2133///
2134/// \param __p
2135/// A pointer to a 128-bit memory location.
2136/// \param __a
2137/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2138/// of the four contiguous elements pointed by \a __p.
2139static __inline__ void __DEFAULT_FN_ATTRS
2140_mm_store1_ps(float *__p, __m128 __a)
2141{
2142 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2144}
2145
2146/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2147/// four contiguous elements in an aligned memory location.
2148///
2149/// \headerfile <x86intrin.h>
2150///
2151/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2152/// instruction.
2153///
2154/// \param __p
2155/// A pointer to a 128-bit memory location.
2156/// \param __a
2157/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2158/// of the four contiguous elements pointed by \a __p.
2159static __inline__ void __DEFAULT_FN_ATTRS
2160_mm_store_ps1(float *__p, __m128 __a)
2161{
2163}
2164
2165/// Stores float values from a 128-bit vector of [4 x float] to an
2166/// aligned memory location in reverse order.
2167///
2168/// \headerfile <x86intrin.h>
2169///
2170/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2171/// instruction.
2172///
2173/// \param __p
2174/// A pointer to a 128-bit memory location. The address of the memory
2175/// location has to be 128-bit aligned.
2176/// \param __a
2177/// A 128-bit vector of [4 x float] containing the values to be stored.
2178static __inline__ void __DEFAULT_FN_ATTRS
2179_mm_storer_ps(float *__p, __m128 __a)
2180{
2181 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2183}
2184
/* Hint selectors for _mm_prefetch, listed by ascending value. The non-MSVC
   _mm_prefetch macro uses bits [1:0] as the locality argument and bit 2 as
   the read/write argument of __builtin_prefetch, so the two ET values are
   the ones with bit 2 set. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2191
2192#ifndef _MSC_VER
// If _MSC_VER is defined, we use the builtin variant of _mm_prefetch.
// Otherwise, we provide this macro, which includes a cast, allowing the user
// to pass a pointer of any type. The _mm_prefetch accepts char to match MSVC.
2196
2197/// Loads one cache line of data from the specified address to a location
2198/// closer to the processor.
2199///
2200/// \headerfile <x86intrin.h>
2201///
2202/// \code
2203/// void _mm_prefetch(const void *a, const int sel);
2204/// \endcode
2205///
2206/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
2207///
2208/// \param a
2209/// A pointer to a memory location containing a cache line of data.
2210/// \param sel
2211/// A predefined integer constant specifying the type of prefetch
2212/// operation: \n
2213/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2214/// PREFETCHNTA instruction will be generated. \n
2215/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2216/// be generated. \n
2217/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2218/// be generated. \n
2219/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2220/// be generated.
#define _mm_prefetch(a, sel)                                                   \
  (__builtin_prefetch((const void *)(a),                                       \
                      (((sel) & 0x4) >> 2), /* bit 2 -> rw argument */         \
                      ((sel) & 0x3)))       /* bits [1:0] -> locality */
2223#endif
2224
2225/// Stores a 64-bit integer in the specified aligned memory location. To
2226/// minimize caching, the data is flagged as non-temporal (unlikely to be
2227/// used again soon).
2228///
2229/// \headerfile <x86intrin.h>
2230///
2231/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2232///
2233/// \param __p
2234/// A pointer to an aligned memory location used to store the register value.
2235/// \param __a
2236/// A 64-bit integer containing the value to be stored.
2237static __inline__ void __DEFAULT_FN_ATTRS
2238_mm_stream_pi(void *__p, __m64 __a)
2239{
2240 __builtin_nontemporal_store(__a, (__m64 *)__p);
2241}
2242
2243/// Moves packed float values from a 128-bit vector of [4 x float] to a
2244/// 128-bit aligned memory location. To minimize caching, the data is flagged
2245/// as non-temporal (unlikely to be used again soon).
2246///
2247/// \headerfile <x86intrin.h>
2248///
2249/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2250///
2251/// \param __p
2252/// A pointer to a 128-bit aligned memory location that will receive the
2253/// single-precision floating-point values.
2254/// \param __a
2255/// A 128-bit vector of [4 x float] containing the values to be moved.
2256static __inline__ void __DEFAULT_FN_ATTRS
2257_mm_stream_ps(void *__p, __m128 __a)
2258{
2259 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2260}
2261
#ifdef __cplusplus
extern "C" {
#endif

/// Store fence: orders every store issued before this call ahead of every
///    store issued after it, so that all earlier stores complete before any
///    later store executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2280
2281/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
2282/// returns it, as specified by the immediate integer operand.
2283///
2284/// \headerfile <x86intrin.h>
2285///
2286/// \code
2287/// int _mm_extract_pi16(__m64 a, int n);
2288/// \endcode
2289///
2290/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
2291///
2292/// \param a
2293/// A 64-bit vector of [4 x i16].
2294/// \param n
2295/// An immediate integer operand that determines which bits are extracted: \n
2296/// 0: Bits [15:0] are copied to the destination. \n
2297/// 1: Bits [31:16] are copied to the destination. \n
2298/// 2: Bits [47:32] are copied to the destination. \n
2299/// 3: Bits [63:48] are copied to the destination.
2300/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
// Arguments are parenthesized before casting so that a compound expression
// (e.g. a conditional or a sum) is cast as a whole rather than only its
// first operand. The unsigned short cast zero-extends the 16-bit element.
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2303
2304/// Copies data from the 64-bit vector of [4 x i16] to the destination,
2305/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
2306/// specified by the immediate operand \a n.
2307///
2308/// \headerfile <x86intrin.h>
2309///
2310/// \code
2311/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
2312/// \endcode
2313///
2314/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
2315///
2316/// \param a
2317/// A 64-bit vector of [4 x i16].
2318/// \param d
2319/// An integer. The lower 16-bit value from this operand is written to the
2320/// destination at the offset specified by operand \a n.
2321/// \param n
/// An immediate integer operand that determines which bits of the
/// destination receive the inserted value. \n
2324/// 0: Bits [15:0] are copied to the destination. \n
2325/// 1: Bits [31:16] are copied to the destination. \n
2326/// 2: Bits [47:32] are copied to the destination. \n
2327/// 3: Bits [63:48] are copied to the destination. \n
2328/// The remaining bits in the destination are copied from the corresponding
2329/// bits in operand \a a.
2330/// \returns A 64-bit integer vector containing the copied packed data from the
2331/// operands.
// Arguments are parenthesized before casting so that a compound expression
// (e.g. a conditional or a sum) is cast as a whole rather than only its
// first operand.
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2334
2335/// Compares each of the corresponding packed 16-bit integer values of
2336/// the 64-bit integer vectors, and writes the greater value to the
2337/// corresponding bits in the destination.
2338///
2339/// \headerfile <x86intrin.h>
2340///
2341/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2342///
2343/// \param __a
2344/// A 64-bit integer vector containing one of the source operands.
2345/// \param __b
2346/// A 64-bit integer vector containing one of the source operands.
2347/// \returns A 64-bit integer vector containing the comparison results.
2348static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2349_mm_max_pi16(__m64 __a, __m64 __b) {
2350 return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
2351}
2352
2353/// Compares each of the corresponding packed 8-bit unsigned integer
2354/// values of the 64-bit integer vectors, and writes the greater value to the
2355/// corresponding bits in the destination.
2356///
2357/// \headerfile <x86intrin.h>
2358///
2359/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2360///
2361/// \param __a
2362/// A 64-bit integer vector containing one of the source operands.
2363/// \param __b
2364/// A 64-bit integer vector containing one of the source operands.
2365/// \returns A 64-bit integer vector containing the comparison results.
2366static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2367_mm_max_pu8(__m64 __a, __m64 __b)
2368{
2369 return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
2370}
2371
2372/// Compares each of the corresponding packed 16-bit integer values of
2373/// the 64-bit integer vectors, and writes the lesser value to the
2374/// corresponding bits in the destination.
2375///
2376/// \headerfile <x86intrin.h>
2377///
2378/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2379///
2380/// \param __a
2381/// A 64-bit integer vector containing one of the source operands.
2382/// \param __b
2383/// A 64-bit integer vector containing one of the source operands.
2384/// \returns A 64-bit integer vector containing the comparison results.
2385static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2386_mm_min_pi16(__m64 __a, __m64 __b) {
2387 return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
2388}
2389
2390/// Compares each of the corresponding packed 8-bit unsigned integer
2391/// values of the 64-bit integer vectors, and writes the lesser value to the
2392/// corresponding bits in the destination.
2393///
2394/// \headerfile <x86intrin.h>
2395///
2396/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2397///
2398/// \param __a
2399/// A 64-bit integer vector containing one of the source operands.
2400/// \param __b
2401/// A 64-bit integer vector containing one of the source operands.
2402/// \returns A 64-bit integer vector containing the comparison results.
2403static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2404_mm_min_pu8(__m64 __a, __m64 __b)
2405{
2406 return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
2407}
2408
2409/// Takes the most significant bit from each 8-bit element in a 64-bit
2410/// integer vector to create an 8-bit mask value. Zero-extends the value to
2411/// 32-bit integer and writes it to the destination.
2412///
2413/// \headerfile <x86intrin.h>
2414///
2415/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2416///
2417/// \param __a
2418/// A 64-bit integer vector containing the values with bits to be extracted.
2419/// \returns The most significant bit from each 8-bit element in \a __a,
2420/// written to bits [7:0].
2421static __inline__ int __DEFAULT_FN_ATTRS_SSE2
2423{
2424 return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
2425}
2426
2427/// Multiplies packed 16-bit unsigned integer values and writes the
2428/// high-order 16 bits of each 32-bit product to the corresponding bits in
2429/// the destination.
2430///
2431/// \headerfile <x86intrin.h>
2432///
2433/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2434///
2435/// \param __a
2436/// A 64-bit integer vector containing one of the source operands.
2437/// \param __b
2438/// A 64-bit integer vector containing one of the source operands.
2439/// \returns A 64-bit integer vector containing the products of both operands.
2440static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2442{
2443 return __trunc64(__builtin_ia32_pmulhuw128((__v8hu)__zext128(__a),
2444 (__v8hu)__zext128(__b)));
2445}
2446
/// Rearranges the four 16-bit integers of a 64-bit integer vector according
///    to an immediate selector and returns the rearranged vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate 8-bit value made of four 2-bit selectors, one per
///    destination element: \n
///    Bits [1:0] select the source for destination bits [15:0]. \n
///    Bits [3:2] select the source for destination bits [31:16]. \n
///    Bits [5:4] select the source for destination bits [47:32]. \n
///    Bits [7:6] select the source for destination bits [63:48]. \n
///    Each selector is interpreted as: \n
///    00: bits [15:0] of \a a. \n
///    01: bits [31:16] of \a a. \n
///    10: bits [47:32] of \a a. \n
///    11: bits [63:48] of \a a. \n
///    Note: the \c _MM_SHUFFLE macro can build the selector;
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> yields an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                                 \
  ((__m64)__builtin_shufflevector(                                             \
      (__v4hi)(__m64)(a), __extension__(__v4hi){},                             \
      ((n) >> 0) & 0x3, ((n) >> 2) & 0x3,                                      \
      ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
2485
2486/// Conditionally copies the values from each 8-bit element in the first
2487/// 64-bit integer vector operand to the specified memory location, as
2488/// specified by the most significant bit in the corresponding element in the
2489/// second 64-bit integer vector operand.
2490///
2491/// To minimize caching, the data is flagged as non-temporal
2492/// (unlikely to be used again soon).
2493///
2494/// \headerfile <x86intrin.h>
2495///
2496/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2497///
2498/// \param __d
2499/// A 64-bit integer vector containing the values with elements to be copied.
2500/// \param __n
2501/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2502/// element determines whether the corresponding element in operand \a __d
2503/// is copied. If the most significant bit of a given element is 1, the
2504/// corresponding element in operand \a __d is copied.
2505/// \param __p
2506/// A pointer to a 64-bit memory location that will receive the conditionally
2507/// copied integer values. The address of the memory location does not have
2508/// to be aligned.
2509static __inline__ void __DEFAULT_FN_ATTRS_SSE2
2510_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2511{
2512 // This is complex, because we need to support the case where __p is pointing
2513 // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
2514 // write might cause a trap where a 64-bit maskmovq would not. (Memory
2515 // locations not selected by the mask bits might still cause traps.)
2516 __m128i __d128 = __anyext128(__d);
2517 __m128i __n128 = __zext128(__n);
2518 if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
2519 ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
2520 // If there's a risk of spurious trap due to a 128-bit write, back up the
2521 // pointer by 8 bytes and shift values in registers to match.
2522 __p -= 8;
2523 __d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8);
2524 __n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8);
2525 }
2526
2527 __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
2528}
2529
2530/// Computes the rounded averages of the packed unsigned 8-bit integer
2531/// values and writes the averages to the corresponding bits in the
2532/// destination.
2533///
2534/// \headerfile <x86intrin.h>
2535///
2536/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2537///
2538/// \param __a
2539/// A 64-bit integer vector containing one of the source operands.
2540/// \param __b
2541/// A 64-bit integer vector containing one of the source operands.
2542/// \returns A 64-bit integer vector containing the averages of both operands.
2543static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2544_mm_avg_pu8(__m64 __a, __m64 __b) {
2545 return __trunc64(__builtin_ia32_pavgb128((__v16qu)__zext128(__a),
2546 (__v16qu)__zext128(__b)));
2547}
2548
2549/// Computes the rounded averages of the packed unsigned 16-bit integer
2550/// values and writes the averages to the corresponding bits in the
2551/// destination.
2552///
2553/// \headerfile <x86intrin.h>
2554///
2555/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2556///
2557/// \param __a
2558/// A 64-bit integer vector containing one of the source operands.
2559/// \param __b
2560/// A 64-bit integer vector containing one of the source operands.
2561/// \returns A 64-bit integer vector containing the averages of both operands.
2562static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2563_mm_avg_pu16(__m64 __a, __m64 __b) {
2564 return __trunc64(
2565 __builtin_ia32_pavgw128((__v8hu)__zext128(__a), (__v8hu)__zext128(__b)));
2566}
2567
2568/// Subtracts the corresponding 8-bit unsigned integer values of the two
2569/// 64-bit vector operands and computes the absolute value for each of the
2570/// difference. Then sum of the 8 absolute differences is written to the
2571/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2572///
2573/// \headerfile <x86intrin.h>
2574///
2575/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2576///
2577/// \param __a
2578/// A 64-bit integer vector containing one of the source operands.
2579/// \param __b
2580/// A 64-bit integer vector containing one of the source operands.
2581/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2582/// sets of absolute differences between both operands. The upper bits are
2583/// cleared.
2584static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2585_mm_sad_pu8(__m64 __a, __m64 __b)
2586{
2587 return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2588 (__v16qi)__zext128(__b)));
2589}
2590
#if defined(__cplusplus)
extern "C" {
#endif

/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode
///    The rounding field must be cleared first: OR-ing _MM_ROUND_UP into a
///    value that already has _MM_ROUND_DOWN set yields _MM_ROUND_TOWARD_ZERO.
///    <c>_MM_SET_ROUNDING_MODE(_MM_ROUND_UP)</c> expands to the expression
///    above.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#if defined(__cplusplus)
} // extern "C"
#endif
2704
/// Builds a 128-bit vector of [4 x float] by picking two elements from the
///    first operand and two elements from the second operand, as directed by
///    an immediate selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate 8-bit value made of four 2-bit selectors. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    Destination assignment: \n
///    Bits [1:0] select the value for destination bits [31:0]. \n
///    Bits [3:2] select the value for destination bits [63:32]. \n
///    Bits [5:4] select the value for destination bits [95:64]. \n
///    Bits [7:6] select the value for destination bits [127:96]. \n
///    Each selector is interpreted as: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand. \n
///    Note: the \c _MM_SHUFFLE macro can build the selector;
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> yields an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask)                                             \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a),                          \
                                 (__v4sf)(__m128)(b),                          \
                                 (int)(mask)))
2747
2748/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2749/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2750///
2751/// \headerfile <x86intrin.h>
2752///
2753/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2754///
2755/// \param __a
2756/// A 128-bit vector of [4 x float]. \n
2757/// Bits [95:64] are written to bits [31:0] of the destination. \n
2758/// Bits [127:96] are written to bits [95:64] of the destination.
2759/// \param __b
2760/// A 128-bit vector of [4 x float].
2761/// Bits [95:64] are written to bits [63:32] of the destination. \n
2762/// Bits [127:96] are written to bits [127:96] of the destination.
2763/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2764static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2765_mm_unpackhi_ps(__m128 __a, __m128 __b) {
2766 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2767}
2768
2769/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2770/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2771///
2772/// \headerfile <x86intrin.h>
2773///
2774/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2775///
2776/// \param __a
2777/// A 128-bit vector of [4 x float]. \n
2778/// Bits [31:0] are written to bits [31:0] of the destination. \n
2779/// Bits [63:32] are written to bits [95:64] of the destination.
2780/// \param __b
2781/// A 128-bit vector of [4 x float]. \n
2782/// Bits [31:0] are written to bits [63:32] of the destination. \n
2783/// Bits [63:32] are written to bits [127:96] of the destination.
2784/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2785static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2786_mm_unpacklo_ps(__m128 __a, __m128 __b) {
2787 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2788}
2789
2790/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2791/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2792/// 96 bits are set to the upper 96 bits of the first parameter.
2793///
2794/// \headerfile <x86intrin.h>
2795///
2796/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2797/// instruction.
2798///
2799/// \param __a
2800/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2801/// written to the upper 96 bits of the result.
2802/// \param __b
2803/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2804/// written to the lower 32 bits of the result.
2805/// \returns A 128-bit floating-point vector of [4 x float].
2806static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2807_mm_move_ss(__m128 __a, __m128 __b) {
2808 __a[0] = __b[0];
2809 return __a;
2810}
2811
2812/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2813/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2814/// 64 bits are set to the upper 64 bits of the first parameter.
2815///
2816/// \headerfile <x86intrin.h>
2817///
2818/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2819///
2820/// \param __a
2821/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2822/// written to the upper 64 bits of the result.
2823/// \param __b
2824/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2825/// written to the lower 64 bits of the result.
2826/// \returns A 128-bit floating-point vector of [4 x float].
2827static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2828_mm_movehl_ps(__m128 __a, __m128 __b) {
2829 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2830}
2831
2832/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2833/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2834/// 64 bits are set to the lower 64 bits of the second parameter.
2835///
2836/// \headerfile <x86intrin.h>
2837///
2838/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2839///
2840/// \param __a
2841/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2842/// written to the lower 64 bits of the result.
2843/// \param __b
2844/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2845/// written to the upper 64 bits of the result.
2846/// \returns A 128-bit floating-point vector of [4 x float].
2847static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2848_mm_movelh_ps(__m128 __a, __m128 __b) {
2849 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2850}
2851
2852/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2853/// float].
2854///
2855/// \headerfile <x86intrin.h>
2856///
2857/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2858///
2859/// \param __a
2860/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2861/// from the corresponding elements in this operand.
2862/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2863/// values from the operand.
2864static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2866{
2867 return __builtin_convertvector((__v4hi)__a, __v4sf);
2868}
2869
2870/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2871/// 128-bit vector of [4 x float].
2872///
2873/// \headerfile <x86intrin.h>
2874///
2875/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2876///
2877/// \param __a
2878/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2879/// destination are copied from the corresponding elements in this operand.
2880/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2881/// values from the operand.
2882static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2884{
2885 return __builtin_convertvector((__v4hu)__a, __v4sf);
2886}
2887
2888/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2889/// into a 128-bit vector of [4 x float].
2890///
2891/// \headerfile <x86intrin.h>
2892///
2893/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2894///
2895/// \param __a
2896/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2897/// from the corresponding lower 4 elements in this operand.
2898/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2899/// values from the operand.
2900static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2902{
2903 return __builtin_convertvector(
2904 __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2905 0, 1, 2, 3), __v4sf);
2906}
2907
2908/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2909/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2910///
2911/// \headerfile <x86intrin.h>
2912///
2913/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2914///
2915/// \param __a
2916/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2917/// destination are copied from the corresponding lower 4 elements in this
2918/// operand.
2919/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2920/// values from the source operand.
2921static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2923{
2924 return __builtin_convertvector(
2925 __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2926 0, 1, 2, 3), __v4sf);
2927}
2928
2929/// Converts the two 32-bit signed integer values from each 64-bit vector
2930/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2931///
2932/// \headerfile <x86intrin.h>
2933///
2934/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2935///
2936/// \param __a
2937/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2938/// copied from the elements in this operand.
2939/// \param __b
2940/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2941/// copied from the elements in this operand.
2942/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2943/// copied and converted values from the first operand. The upper 64 bits
2944/// contain the copied and converted values from the second operand.
2945static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2947{
2948 return __builtin_convertvector(
2949 __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2950 0, 1, 2, 3), __v4sf);
2951}
2952
2953/// Converts each single-precision floating-point element of a 128-bit
2954/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2955/// packs the results into a 64-bit integer vector of [4 x i16].
2956///
2957/// If the floating-point element is NaN or infinity, or if the
2958/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2959/// it is converted to 0x8000. Otherwise if the floating-point element is
2960/// greater than 0x7FFF, it is converted to 0x7FFF.
2961///
2962/// \headerfile <x86intrin.h>
2963///
2964/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2965///
2966/// \param __a
2967/// A 128-bit floating-point vector of [4 x float].
2968/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2969/// values.
2970static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2972{
2973 return __trunc64(__builtin_ia32_packssdw128(
2974 (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
2975}
2976
2977/// Converts each single-precision floating-point element of a 128-bit
2978/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2979/// packs the results into the lower 32 bits of a 64-bit integer vector of
2980/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2981///
2982/// If the floating-point element is NaN or infinity, or if the
2983/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2984/// is converted to 0x80. Otherwise if the floating-point element is greater
2985/// than 0x7F, it is converted to 0x7F.
2986///
2987/// \headerfile <x86intrin.h>
2988///
2989/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2990///
2991/// \param __a
2992/// 128-bit floating-point vector of [4 x float].
2993/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2994/// converted values and the uppper 32 bits are set to zero.
2995static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2997{
2998 __m64 __b, __c;
2999
3002
3003 return _mm_packs_pi16(__b, __c);
3004}
3005
3006/// Extracts the sign bits from each single-precision floating-point
3007/// element of a 128-bit floating-point vector of [4 x float] and returns the
3008/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3009/// to zero.
3010///
3011/// \headerfile <x86intrin.h>
3012///
3013/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3014///
3015/// \param __a
3016/// A 128-bit floating-point vector of [4 x float].
3017/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3018/// single-precision floating-point element of the parameter. Bits [31:4] are
3019/// set to zero.
3020static __inline__ int __DEFAULT_FN_ATTRS
3022{
3023 return __builtin_ia32_movmskps((__v4sf)__a);
3024}
3025
/* Comparison predicates accepted by _mm_cmp_ps and _mm_cmp_ss. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
3035
/// Compares the corresponding elements of two 128-bit vectors of
///    [4 x float] using the comparison selected by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a),                           \
                                (__v4sf)(__m128)(b),                           \
                                (c)))
3070
/// Compares the corresponding scalar values of two 128-bit vectors of
///    [4 x float] using the comparison selected by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a),                           \
                                (__v4sf)(__m128)(b),                           \
                                (c)))
3105
/* Declaration attribute requesting 16-byte alignment. The reserved
 * double-underscore spelling is used so a user macro named `aligned` cannot
 * change what this expands to. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
3107
/* Packs four 2-bit selectors into one 8-bit shuffle immediate:
 * w -> bits [1:0], x -> bits [3:2], y -> bits [5:4], z -> bits [7:6]. */
#define _MM_SHUFFLE(z, y, x, w)                                                \
  (((z) << 6) | ((y) << 4) | ((x) << 2) | ((w) << 0))
3109
/* Exception state flags, for use with the value read by _mm_getcsr(). */
#define _MM_EXCEPT_INVALID    (1U << 0)
#define _MM_EXCEPT_DENORM     (1U << 1)
#define _MM_EXCEPT_DIV_ZERO   (1U << 2)
#define _MM_EXCEPT_OVERFLOW   (1U << 3)
#define _MM_EXCEPT_UNDERFLOW  (1U << 4)
#define _MM_EXCEPT_INEXACT    (1U << 5)
#define _MM_EXCEPT_MASK       (0x003fU) /* all six state flags */

/* Exception mask flags. */
#define _MM_MASK_INVALID      (1U << 7)
#define _MM_MASK_DENORM       (1U << 8)
#define _MM_MASK_DIV_ZERO     (1U << 9)
#define _MM_MASK_OVERFLOW     (1U << 10)
#define _MM_MASK_UNDERFLOW    (1U << 11)
#define _MM_MASK_INEXACT      (1U << 12)
#define _MM_MASK_MASK         (0x1f80U) /* all six mask flags */

/* Rounding mode field (two bits starting at bit 13). */
#define _MM_ROUND_NEAREST     (0U << 13)
#define _MM_ROUND_DOWN        (1U << 13)
#define _MM_ROUND_UP          (2U << 13)
#define _MM_ROUND_TOWARD_ZERO (3U << 13)
#define _MM_ROUND_MASK        (0x6000U)

/* Flush-to-zero mode (bit 15). */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (1U << 15)
#define _MM_FLUSH_ZERO_OFF    (0U << 15)
3135
/* Read one field of the value returned by _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK()                                               \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE()                                              \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE()                                              \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE()                                                \
  (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace one field: clear it in the current value, OR in x, and write the
 * result back with _mm_setcsr(). */
#define _MM_SET_EXCEPTION_MASK(x)                                              \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x)                                               \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3145
/* Transposes, in place, the 4x4 matrix whose rows are the four [4 x float]
 * lvalues row0..row3.
 *
 * The temporaries use reserved (double-underscore) names so that they cannot
 * shadow the caller's row arguments; with plain names such as tmp0, a call
 * like _MM_TRANSPOSE4_PS(tmp0, tmp1, tmp2, tmp3) would operate on the
 * macro's own uninitialized locals instead of the caller's variables. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
do {                                                                           \
  __m128 __mm_t4_tmp0 = _mm_unpacklo_ps((row0), (row1));                       \
  __m128 __mm_t4_tmp2 = _mm_unpacklo_ps((row2), (row3));                       \
  __m128 __mm_t4_tmp1 = _mm_unpackhi_ps((row0), (row1));                       \
  __m128 __mm_t4_tmp3 = _mm_unpackhi_ps((row2), (row3));                       \
  (row0) = _mm_movelh_ps(__mm_t4_tmp0, __mm_t4_tmp2);                          \
  (row1) = _mm_movehl_ps(__mm_t4_tmp2, __mm_t4_tmp0);                          \
  (row2) = _mm_movelh_ps(__mm_t4_tmp1, __mm_t4_tmp3);                          \
  (row3) = _mm_movehl_ps(__mm_t4_tmp3, __mm_t4_tmp1);                          \
} while (0)
3158
/* Compatibility aliases: alternative _m_* spellings that map onto the
 * _mm_* intrinsics. */

/* Element extract / insert. */
#define _m_pextrw   _mm_extract_pi16
#define _m_pinsrw   _mm_insert_pi16

/* Packed minimum / maximum. */
#define _m_pmaxsw   _mm_max_pi16
#define _m_pmaxub   _mm_max_pu8
#define _m_pminsw   _mm_min_pi16
#define _m_pminub   _mm_min_pu8

/* Mask extraction, multiply, shuffle and masked store. */
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw  _mm_mulhi_pu16
#define _m_pshufw   _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64

/* Averages and sum of absolute differences. */
#define _m_pavgb    _mm_avg_pu8
#define _m_pavgw    _mm_avg_pu16
#define _m_psadbw   _mm_sad_pu8

/* Bare prefix alias. */
#define _m_         _mm_
3174
/* Remove the header-private helper macros so they do not leak into code that
 * includes this header. */
#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
#undef __DEFAULT_FN_ATTRS_SSE2
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS
#undef __zeroupper64
#undef __anyext128
#undef __zext128
#undef __trunc64
3183
3184/* Ugly hack for backwards-compatibility (compatible with gcc) */
3185#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3186#include <emmintrin.h>
3187#endif
3188
3189#endif /* __XMMINTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
static __inline__ uint32_t uint32_t __y
Definition arm_acle.h:125
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ void int __a
Definition emmintrin.h:4077
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition mmintrin.h:160
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
Definition mmintrin.h:49
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition mmintrin.h:1307
#define __DEFAULT_FN_ATTRS_SSE2
Definition mmintrin.h:42
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1181
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition xmmintrin.h:1503
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition xmmintrin.h:2563
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition xmmintrin.h:270
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2807
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition xmmintrin.h:2404
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:582
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1710
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x float].
Definition xmmintrin.h:235
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:957
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
Definition xmmintrin.h:534
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2865
#define __anyext128(x)
Definition xmmintrin.h:56
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:560
static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
Definition xmmintrin.h:2422
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition xmmintrin.h:2544
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i32].
Definition xmmintrin.h:1481
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit integer.
Definition xmmintrin.h:1415
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1684
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:513
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition xmmintrin.h:2441
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:160
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition xmmintrin.h:1846
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for ineq...
Definition xmmintrin.h:778
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1157
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition xmmintrin.h:2946
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse order.
Definition xmmintrin.h:2179
#define __zeroupper64(x)
Definition xmmintrin.h:59
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition xmmintrin.h:1899
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:855
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition xmmintrin.h:1980
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1276
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition xmmintrin.h:119
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1613
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition xmmintrin.h:2786
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:79
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
Definition xmmintrin.h:1727
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:608
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition xmmintrin.h:2585
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
Definition xmmintrin.h:98
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:323
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:489
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2058
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1954
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1300
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:730
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:1108
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an aligned memory location.
Definition xmmintrin.h:2140
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1635
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1936
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition xmmintrin.h:2765
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition xmmintrin.h:218
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
Definition xmmintrin.h:415
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2386
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of a 128-bit vector of [4 x float].
Definition xmmintrin.h:306
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1348
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition xmmintrin.h:454
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1133
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 x float].
Definition xmmintrin.h:1774
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2100
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2828
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x float].
Definition xmmintrin.h:1823
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
Definition xmmintrin.h:369
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
Definition xmmintrin.h:2257
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition xmmintrin.h:2238
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1205
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit integer.
Definition xmmintrin.h:1393
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:680
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2349
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1324
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2971
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2848
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2079
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:905
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4 x float].
Definition xmmintrin.h:1747
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2901
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:287
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition xmmintrin.h:2121
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:471
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:252
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality.
Definition xmmintrin.h:756
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2922
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2883
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition xmmintrin.h:1571
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i32].
Definition xmmintrin.h:1461
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward zero) 32-bit integer.
Definition xmmintrin.h:1525
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition xmmintrin.h:3021
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2996
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:433
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition xmmintrin.h:1885
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:1009
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:828
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2037
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:930
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:982
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:1033
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2021
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:656
#define __trunc64(x)
Definition xmmintrin.h:51
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1252
static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition xmmintrin.h:2510
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:805
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an aligned memory location.
Definition xmmintrin.h:2160
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition xmmintrin.h:2007
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition xmmintrin.h:348
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:630
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
Definition xmmintrin.h:139
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
Definition xmmintrin.h:394
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...
Definition xmmintrin.h:200
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1371
#define __zext128(x)
Definition xmmintrin.h:53
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1229
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition xmmintrin.h:1592
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1919
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:878
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:1060
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:706
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition xmmintrin.h:2367
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:1084
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1863
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1801
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition xmmintrin.h:179