clang 23.0.0git
xmmintrin.h
Go to the documentation of this file.
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
/* Reject any target that is neither 32-bit nor 64-bit x86. */
#if !(defined(__i386__) || defined(__x86_64__))
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
16
17#include <mmintrin.h>
18
/* 16-byte vector of float, used in this header as the operand type for
 * arithmetic and builtin calls. */
typedef float __v4sf __attribute__((__vector_size__(16)));
/* 16-byte vector of float with explicit 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same layout as __m128, but with the alignment requirement lowered to 1. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned integer views of a 16-byte vector. */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
28
/* <mm_malloc.h> is only included in a hosted environment, as it depends on
 * a standard library to provide allocation routines. */
31#if __STDC_HOSTED__
32#include <mm_malloc.h>
33#endif
34
35/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))

/* The *_CONSTEXPR variants append 'constexpr' only when compiling as C++11 or
 * newer; in C and in older C++ they expand to the plain attribute sets. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#endif
50
/* Width-conversion helpers built on __builtin_shufflevector.
 *
 * Each replacement list is wrapped in an outer pair of parentheses so that the
 * leading cast always applies to the whole shuffle result, even if the macro
 * invocation is followed by a postfix operator such as a subscript. */

/* Element 0 of (x) viewed as __v2di, returned as __m64. */
#define __trunc64(x)                                                           \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
/* (x) viewed as __v2si in elements 0-1; elements 2-3 are taken from the
 * empty-initialized second operand. */
#define __zext128(x)                                                           \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, 2, 3))
/* (x) viewed as __v2si in elements 0-1; elements 2-3 use index -1, i.e. the
 * shuffle does not pin them to a particular source element. */
#define __anyext128(x)                                                         \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1))
/* Elements 0-1 of (x) viewed as __v4si are kept; elements 2-3 are taken from
 * the empty-initialized second operand. */
#define __zeroupper64(x)                                                       \
  ((__m128i)__builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
                                    1, 4, 5))
62
63/// Adds the 32-bit float values in the low-order bits of the operands.
64///
65/// \headerfile <x86intrin.h>
66///
67/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
68///
69/// \param __a
70/// A 128-bit vector of [4 x float] containing one of the source operands.
71/// The lower 32 bits of this operand are used in the calculation.
72/// \param __b
73/// A 128-bit vector of [4 x float] containing one of the source operands.
74/// The lower 32 bits of this operand are used in the calculation.
75/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
76/// of the lower 32 bits of both operands. The upper 96 bits are copied from
77/// the upper 96 bits of the first source operand.
78static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
79_mm_add_ss(__m128 __a, __m128 __b) {
80 __a[0] += __b[0];
81 return __a;
82}
83
84/// Adds two 128-bit vectors of [4 x float], and returns the results of
85/// the addition.
86///
87/// \headerfile <x86intrin.h>
88///
89/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
90///
91/// \param __a
92/// A 128-bit vector of [4 x float] containing one of the source operands.
93/// \param __b
94/// A 128-bit vector of [4 x float] containing one of the source operands.
95/// \returns A 128-bit vector of [4 x float] containing the sums of both
96/// operands.
97static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
98_mm_add_ps(__m128 __a, __m128 __b) {
99 return (__m128)((__v4sf)__a + (__v4sf)__b);
100}
101
102/// Subtracts the 32-bit float value in the low-order bits of the second
103/// operand from the corresponding value in the first operand.
104///
105/// \headerfile <x86intrin.h>
106///
107/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
108///
109/// \param __a
110/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
111/// of this operand are used in the calculation.
112/// \param __b
113/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
114/// bits of this operand are used in the calculation.
115/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
116/// difference of the lower 32 bits of both operands. The upper 96 bits are
117/// copied from the upper 96 bits of the first source operand.
118static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
119_mm_sub_ss(__m128 __a, __m128 __b) {
120 __a[0] -= __b[0];
121 return __a;
122}
123
124/// Subtracts each of the values of the second operand from the first
125/// operand, both of which are 128-bit vectors of [4 x float] and returns
126/// the results of the subtraction.
127///
128/// \headerfile <x86intrin.h>
129///
130/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
131///
132/// \param __a
133/// A 128-bit vector of [4 x float] containing the minuend.
134/// \param __b
135/// A 128-bit vector of [4 x float] containing the subtrahend.
136/// \returns A 128-bit vector of [4 x float] containing the differences between
137/// both operands.
138static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
139_mm_sub_ps(__m128 __a, __m128 __b) {
140 return (__m128)((__v4sf)__a - (__v4sf)__b);
141}
142
143/// Multiplies two 32-bit float values in the low-order bits of the
144/// operands.
145///
146/// \headerfile <x86intrin.h>
147///
148/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
149///
150/// \param __a
151/// A 128-bit vector of [4 x float] containing one of the source operands.
152/// The lower 32 bits of this operand are used in the calculation.
153/// \param __b
154/// A 128-bit vector of [4 x float] containing one of the source operands.
155/// The lower 32 bits of this operand are used in the calculation.
156/// \returns A 128-bit vector of [4 x float] containing the product of the lower
157/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
158/// bits of the first source operand.
159static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
160_mm_mul_ss(__m128 __a, __m128 __b) {
161 __a[0] *= __b[0];
162 return __a;
163}
164
165/// Multiplies two 128-bit vectors of [4 x float] and returns the
166/// results of the multiplication.
167///
168/// \headerfile <x86intrin.h>
169///
170/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
171///
172/// \param __a
173/// A 128-bit vector of [4 x float] containing one of the source operands.
174/// \param __b
175/// A 128-bit vector of [4 x float] containing one of the source operands.
176/// \returns A 128-bit vector of [4 x float] containing the products of both
177/// operands.
178static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
179_mm_mul_ps(__m128 __a, __m128 __b) {
180 return (__m128)((__v4sf)__a * (__v4sf)__b);
181}
182
183/// Divides the value in the low-order 32 bits of the first operand by
184/// the corresponding value in the second operand.
185///
186/// \headerfile <x86intrin.h>
187///
188/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
189///
190/// \param __a
191/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
192/// bits of this operand are used in the calculation.
193/// \param __b
194/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
195/// of this operand are used in the calculation.
196/// \returns A 128-bit vector of [4 x float] containing the quotients of the
197/// lower 32 bits of both operands. The upper 96 bits are copied from the
198/// upper 96 bits of the first source operand.
199static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
200_mm_div_ss(__m128 __a, __m128 __b) {
201 __a[0] /= __b[0];
202 return __a;
203}
204
205/// Divides two 128-bit vectors of [4 x float].
206///
207/// \headerfile <x86intrin.h>
208///
209/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
210///
211/// \param __a
212/// A 128-bit vector of [4 x float] containing the dividend.
213/// \param __b
214/// A 128-bit vector of [4 x float] containing the divisor.
215/// \returns A 128-bit vector of [4 x float] containing the quotients of both
216/// operands.
217static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
218_mm_div_ps(__m128 __a, __m128 __b) {
219 return (__m128)((__v4sf)__a / (__v4sf)__b);
220}
221
222/// Calculates the square root of the value stored in the low-order bits
223/// of a 128-bit vector of [4 x float].
224///
225/// \headerfile <x86intrin.h>
226///
227/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
228///
229/// \param __a
230/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
231/// used in the calculation.
232/// \returns A 128-bit vector of [4 x float] containing the square root of the
233/// value in the low-order bits of the operand.
234static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
235 __a[0] = __builtin_elementwise_sqrt(__a[0]);
236 return __a;
237}
238
239/// Calculates the square roots of the values stored in a 128-bit vector
240/// of [4 x float].
241///
242/// \headerfile <x86intrin.h>
243///
244/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
245///
246/// \param __a
247/// A 128-bit vector of [4 x float].
248/// \returns A 128-bit vector of [4 x float] containing the square roots of the
249/// values in the operand.
250static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
251 return __builtin_elementwise_sqrt(__a);
252}
253
254/// Calculates the approximate reciprocal of the value stored in the
255/// low-order bits of a 128-bit vector of [4 x float].
256///
257/// \headerfile <x86intrin.h>
258///
259/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
260///
261/// \param __a
262/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
263/// used in the calculation.
264/// \returns A 128-bit vector of [4 x float] containing the approximate
265/// reciprocal of the value in the low-order bits of the operand.
266static __inline__ __m128 __DEFAULT_FN_ATTRS
268{
269 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
270}
271
272/// Calculates the approximate reciprocals of the values stored in a
273/// 128-bit vector of [4 x float].
274///
275/// \headerfile <x86intrin.h>
276///
277/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
278///
279/// \param __a
280/// A 128-bit vector of [4 x float].
281/// \returns A 128-bit vector of [4 x float] containing the approximate
282/// reciprocals of the values in the operand.
283static __inline__ __m128 __DEFAULT_FN_ATTRS
285{
286 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
287}
288
289/// Calculates the approximate reciprocal of the square root of the value
290/// stored in the low-order bits of a 128-bit vector of [4 x float].
291///
292/// \headerfile <x86intrin.h>
293///
294/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
295///
296/// \param __a
297/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
298/// used in the calculation.
299/// \returns A 128-bit vector of [4 x float] containing the approximate
300/// reciprocal of the square root of the value in the low-order bits of the
301/// operand.
302static __inline__ __m128 __DEFAULT_FN_ATTRS
304{
305 return __builtin_ia32_rsqrtss((__v4sf)__a);
306}
307
308/// Calculates the approximate reciprocals of the square roots of the
309/// values stored in a 128-bit vector of [4 x float].
310///
311/// \headerfile <x86intrin.h>
312///
313/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
314///
315/// \param __a
316/// A 128-bit vector of [4 x float].
317/// \returns A 128-bit vector of [4 x float] containing the approximate
318/// reciprocals of the square roots of the values in the operand.
319static __inline__ __m128 __DEFAULT_FN_ATTRS
321{
322 return __builtin_ia32_rsqrtps((__v4sf)__a);
323}
324
325/// Compares two 32-bit float values in the low-order bits of both
326/// operands and returns the lesser value in the low-order bits of the
327/// vector of [4 x float].
328///
329/// If either value in a comparison is NaN, returns the value from \a __b.
330///
331/// \headerfile <x86intrin.h>
332///
333/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
334///
335/// \param __a
336/// A 128-bit vector of [4 x float] containing one of the operands. The lower
337/// 32 bits of this operand are used in the comparison.
338/// \param __b
339/// A 128-bit vector of [4 x float] containing one of the operands. The lower
340/// 32 bits of this operand are used in the comparison.
341/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
342/// minimum value between both operands. The upper 96 bits are copied from
343/// the upper 96 bits of the first source operand.
344static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b) {
345 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
346}
347
348/// Compares two 128-bit vectors of [4 x float] and returns the lesser
349/// of each pair of values.
350///
351/// If either value in a comparison is NaN, returns the value from \a __b.
352///
353/// \headerfile <x86intrin.h>
354///
355/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
356///
357/// \param __a
358/// A 128-bit vector of [4 x float] containing one of the operands.
359/// \param __b
360/// A 128-bit vector of [4 x float] containing one of the operands.
361/// \returns A 128-bit vector of [4 x float] containing the minimum values
362/// between both operands.
363static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_ps(__m128 __a,
364 __m128 __b) {
365 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
366}
367
368/// Compares two 32-bit float values in the low-order bits of both
369/// operands and returns the greater value in the low-order bits of a 128-bit
370/// vector of [4 x float].
371///
372/// If either value in a comparison is NaN, returns the value from \a __b.
373///
374/// \headerfile <x86intrin.h>
375///
376/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
377///
378/// \param __a
379/// A 128-bit vector of [4 x float] containing one of the operands. The lower
380/// 32 bits of this operand are used in the comparison.
381/// \param __b
382/// A 128-bit vector of [4 x float] containing one of the operands. The lower
383/// 32 bits of this operand are used in the comparison.
384/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
385/// maximum value between both operands. The upper 96 bits are copied from
386/// the upper 96 bits of the first source operand.
387static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b) {
388 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
389}
390
391/// Compares two 128-bit vectors of [4 x float] and returns the greater
392/// of each pair of values.
393///
394/// If either value in a comparison is NaN, returns the value from \a __b.
395///
396/// \headerfile <x86intrin.h>
397///
398/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
399///
400/// \param __a
401/// A 128-bit vector of [4 x float] containing one of the operands.
402/// \param __b
403/// A 128-bit vector of [4 x float] containing one of the operands.
404/// \returns A 128-bit vector of [4 x float] containing the maximum values
405/// between both operands.
406static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_ps(__m128 __a,
407 __m128 __b) {
408 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
409}
410
411/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
412///
413/// \headerfile <x86intrin.h>
414///
415/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
416///
417/// \param __a
418/// A 128-bit vector containing one of the source operands.
419/// \param __b
420/// A 128-bit vector containing one of the source operands.
421/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
422/// values between both operands.
423static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
424_mm_and_ps(__m128 __a, __m128 __b) {
425 return (__m128)((__v4su)__a & (__v4su)__b);
426}
427
428/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
429/// the one's complement of the values contained in the first source
430/// operand.
431///
432/// \headerfile <x86intrin.h>
433///
434/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
435///
436/// \param __a
437/// A 128-bit vector of [4 x float] containing the first source operand. The
438/// one's complement of this value is used in the bitwise AND.
439/// \param __b
440/// A 128-bit vector of [4 x float] containing the second source operand.
441/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
442/// one's complement of the first operand and the values in the second
443/// operand.
444static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
445_mm_andnot_ps(__m128 __a, __m128 __b) {
446 return (__m128)(~(__v4su)__a & (__v4su)__b);
447}
448
449/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
450///
451/// \headerfile <x86intrin.h>
452///
453/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
454///
455/// \param __a
456/// A 128-bit vector of [4 x float] containing one of the source operands.
457/// \param __b
458/// A 128-bit vector of [4 x float] containing one of the source operands.
459/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
460/// values between both operands.
461static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
462_mm_or_ps(__m128 __a, __m128 __b) {
463 return (__m128)((__v4su)__a | (__v4su)__b);
464}
465
466/// Performs a bitwise exclusive OR of two 128-bit vectors of
467/// [4 x float].
468///
469/// \headerfile <x86intrin.h>
470///
471/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
472///
473/// \param __a
474/// A 128-bit vector of [4 x float] containing one of the source operands.
475/// \param __b
476/// A 128-bit vector of [4 x float] containing one of the source operands.
477/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
478/// of the values between both operands.
479static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
480_mm_xor_ps(__m128 __a, __m128 __b) {
481 return (__m128)((__v4su)__a ^ (__v4su)__b);
482}
483
484/// Compares two 32-bit float values in the low-order bits of both
485/// operands for equality.
486///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
489/// If either value in a comparison is NaN, returns false.
490///
491/// \headerfile <x86intrin.h>
492///
493/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
494///
495/// \param __a
496/// A 128-bit vector of [4 x float] containing one of the operands. The lower
497/// 32 bits of this operand are used in the comparison.
498/// \param __b
499/// A 128-bit vector of [4 x float] containing one of the operands. The lower
500/// 32 bits of this operand are used in the comparison.
501/// \returns A 128-bit vector of [4 x float] containing the comparison results
502/// in the low-order bits.
503static __inline__ __m128 __DEFAULT_FN_ATTRS
504_mm_cmpeq_ss(__m128 __a, __m128 __b)
505{
506 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
507}
508
509/// Compares each of the corresponding 32-bit float values of the
510/// 128-bit vectors of [4 x float] for equality.
511///
512/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
513/// If either value in a comparison is NaN, returns false.
514///
515/// \headerfile <x86intrin.h>
516///
517/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
518///
519/// \param __a
520/// A 128-bit vector of [4 x float].
521/// \param __b
522/// A 128-bit vector of [4 x float].
523/// \returns A 128-bit vector of [4 x float] containing the comparison results.
524static __inline__ __m128 __DEFAULT_FN_ATTRS
525_mm_cmpeq_ps(__m128 __a, __m128 __b)
526{
527 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
528}
529
530/// Compares two 32-bit float values in the low-order bits of both
531/// operands to determine if the value in the first operand is less than the
532/// corresponding value in the second operand.
533///
534/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
535/// low-order bits of a vector of [4 x float].
536/// If either value in a comparison is NaN, returns false.
537///
538/// \headerfile <x86intrin.h>
539///
540/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
541///
542/// \param __a
543/// A 128-bit vector of [4 x float] containing one of the operands. The lower
544/// 32 bits of this operand are used in the comparison.
545/// \param __b
546/// A 128-bit vector of [4 x float] containing one of the operands. The lower
547/// 32 bits of this operand are used in the comparison.
548/// \returns A 128-bit vector of [4 x float] containing the comparison results
549/// in the low-order bits.
550static __inline__ __m128 __DEFAULT_FN_ATTRS
551_mm_cmplt_ss(__m128 __a, __m128 __b)
552{
553 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
554}
555
556/// Compares each of the corresponding 32-bit float values of the
557/// 128-bit vectors of [4 x float] to determine if the values in the first
558/// operand are less than those in the second operand.
559///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
561/// If either value in a comparison is NaN, returns false.
562///
563/// \headerfile <x86intrin.h>
564///
565/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
566///
567/// \param __a
568/// A 128-bit vector of [4 x float].
569/// \param __b
570/// A 128-bit vector of [4 x float].
571/// \returns A 128-bit vector of [4 x float] containing the comparison results.
572static __inline__ __m128 __DEFAULT_FN_ATTRS
573_mm_cmplt_ps(__m128 __a, __m128 __b)
574{
575 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
576}
577
578/// Compares two 32-bit float values in the low-order bits of both
579/// operands to determine if the value in the first operand is less than or
580/// equal to the corresponding value in the second operand.
581///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in
/// the low-order bits of a vector of [4 x float].
584/// If either value in a comparison is NaN, returns false.
585///
586/// \headerfile <x86intrin.h>
587///
588/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
589///
590/// \param __a
591/// A 128-bit vector of [4 x float] containing one of the operands. The lower
592/// 32 bits of this operand are used in the comparison.
593/// \param __b
594/// A 128-bit vector of [4 x float] containing one of the operands. The lower
595/// 32 bits of this operand are used in the comparison.
596/// \returns A 128-bit vector of [4 x float] containing the comparison results
597/// in the low-order bits.
598static __inline__ __m128 __DEFAULT_FN_ATTRS
599_mm_cmple_ss(__m128 __a, __m128 __b)
600{
601 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
602}
603
604/// Compares each of the corresponding 32-bit float values of the
605/// 128-bit vectors of [4 x float] to determine if the values in the first
606/// operand are less than or equal to those in the second operand.
607///
608/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
609/// If either value in a comparison is NaN, returns false.
610///
611/// \headerfile <x86intrin.h>
612///
613/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
614///
615/// \param __a
616/// A 128-bit vector of [4 x float].
617/// \param __b
618/// A 128-bit vector of [4 x float].
619/// \returns A 128-bit vector of [4 x float] containing the comparison results.
620static __inline__ __m128 __DEFAULT_FN_ATTRS
621_mm_cmple_ps(__m128 __a, __m128 __b)
622{
623 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
624}
625
626/// Compares two 32-bit float values in the low-order bits of both
627/// operands to determine if the value in the first operand is greater than
628/// the corresponding value in the second operand.
629///
630/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
631/// low-order bits of a vector of [4 x float].
632/// If either value in a comparison is NaN, returns false.
633///
634/// \headerfile <x86intrin.h>
635///
636/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
637///
638/// \param __a
639/// A 128-bit vector of [4 x float] containing one of the operands. The lower
640/// 32 bits of this operand are used in the comparison.
641/// \param __b
642/// A 128-bit vector of [4 x float] containing one of the operands. The lower
643/// 32 bits of this operand are used in the comparison.
644/// \returns A 128-bit vector of [4 x float] containing the comparison results
645/// in the low-order bits.
646static __inline__ __m128 __DEFAULT_FN_ATTRS
647_mm_cmpgt_ss(__m128 __a, __m128 __b)
648{
649 return (__m128)__builtin_shufflevector((__v4sf)__a,
650 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
651 4, 1, 2, 3);
652}
653
654/// Compares each of the corresponding 32-bit float values of the
655/// 128-bit vectors of [4 x float] to determine if the values in the first
656/// operand are greater than those in the second operand.
657///
658/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
659/// If either value in a comparison is NaN, returns false.
660///
661/// \headerfile <x86intrin.h>
662///
663/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
664///
665/// \param __a
666/// A 128-bit vector of [4 x float].
667/// \param __b
668/// A 128-bit vector of [4 x float].
669/// \returns A 128-bit vector of [4 x float] containing the comparison results.
670static __inline__ __m128 __DEFAULT_FN_ATTRS
671_mm_cmpgt_ps(__m128 __a, __m128 __b)
672{
673 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
674}
675
676/// Compares two 32-bit float values in the low-order bits of both
677/// operands to determine if the value in the first operand is greater than
678/// or equal to the corresponding value in the second operand.
679///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
681/// low-order bits of a vector of [4 x float].
682/// If either value in a comparison is NaN, returns false.
683///
684/// \headerfile <x86intrin.h>
685///
686/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
687///
688/// \param __a
689/// A 128-bit vector of [4 x float] containing one of the operands. The lower
690/// 32 bits of this operand are used in the comparison.
691/// \param __b
692/// A 128-bit vector of [4 x float] containing one of the operands. The lower
693/// 32 bits of this operand are used in the comparison.
694/// \returns A 128-bit vector of [4 x float] containing the comparison results
695/// in the low-order bits.
696static __inline__ __m128 __DEFAULT_FN_ATTRS
697_mm_cmpge_ss(__m128 __a, __m128 __b)
698{
699 return (__m128)__builtin_shufflevector((__v4sf)__a,
700 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
701 4, 1, 2, 3);
702}
703
704/// Compares each of the corresponding 32-bit float values of the
705/// 128-bit vectors of [4 x float] to determine if the values in the first
706/// operand are greater than or equal to those in the second operand.
707///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
709/// If either value in a comparison is NaN, returns false.
710///
711/// \headerfile <x86intrin.h>
712///
713/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
714///
715/// \param __a
716/// A 128-bit vector of [4 x float].
717/// \param __b
718/// A 128-bit vector of [4 x float].
719/// \returns A 128-bit vector of [4 x float] containing the comparison results.
720static __inline__ __m128 __DEFAULT_FN_ATTRS
721_mm_cmpge_ps(__m128 __a, __m128 __b)
722{
723 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
724}
725
726/// Compares two 32-bit float values in the low-order bits of both operands
727/// for inequality.
728///
729/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
730/// low-order bits of a vector of [4 x float].
731/// If either value in a comparison is NaN, returns true.
732///
733/// \headerfile <x86intrin.h>
734///
735/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
736/// instructions.
737///
738/// \param __a
739/// A 128-bit vector of [4 x float] containing one of the operands. The lower
740/// 32 bits of this operand are used in the comparison.
741/// \param __b
742/// A 128-bit vector of [4 x float] containing one of the operands. The lower
743/// 32 bits of this operand are used in the comparison.
744/// \returns A 128-bit vector of [4 x float] containing the comparison results
745/// in the low-order bits.
746static __inline__ __m128 __DEFAULT_FN_ATTRS
747_mm_cmpneq_ss(__m128 __a, __m128 __b)
748{
749 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
750}
751
752/// Compares each of the corresponding 32-bit float values of the
753/// 128-bit vectors of [4 x float] for inequality.
754///
755/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
756/// If either value in a comparison is NaN, returns true.
757///
758/// \headerfile <x86intrin.h>
759///
760/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
761/// instructions.
762///
763/// \param __a
764/// A 128-bit vector of [4 x float].
765/// \param __b
766/// A 128-bit vector of [4 x float].
767/// \returns A 128-bit vector of [4 x float] containing the comparison results.
768static __inline__ __m128 __DEFAULT_FN_ATTRS
769_mm_cmpneq_ps(__m128 __a, __m128 __b)
770{
771 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
772}
773
774/// Compares two 32-bit float values in the low-order bits of both
775/// operands to determine if the value in the first operand is not less than
776/// the corresponding value in the second operand.
777///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
779/// low-order bits of a vector of [4 x float].
780/// If either value in a comparison is NaN, returns true.
781///
782/// \headerfile <x86intrin.h>
783///
784/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
785/// instructions.
786///
787/// \param __a
788/// A 128-bit vector of [4 x float] containing one of the operands. The lower
789/// 32 bits of this operand are used in the comparison.
790/// \param __b
791/// A 128-bit vector of [4 x float] containing one of the operands. The lower
792/// 32 bits of this operand are used in the comparison.
793/// \returns A 128-bit vector of [4 x float] containing the comparison results
794/// in the low-order bits.
795static __inline__ __m128 __DEFAULT_FN_ATTRS
796_mm_cmpnlt_ss(__m128 __a, __m128 __b)
797{
798 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
799}
800
801/// Compares each of the corresponding 32-bit float values of the
802/// 128-bit vectors of [4 x float] to determine if the values in the first
803/// operand are not less than those in the second operand.
804///
805/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
806/// If either value in a comparison is NaN, returns true.
807///
808/// \headerfile <x86intrin.h>
809///
810/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
811/// instructions.
812///
813/// \param __a
814/// A 128-bit vector of [4 x float].
815/// \param __b
816/// A 128-bit vector of [4 x float].
817/// \returns A 128-bit vector of [4 x float] containing the comparison results.
818static __inline__ __m128 __DEFAULT_FN_ATTRS
819_mm_cmpnlt_ps(__m128 __a, __m128 __b)
820{
821 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
822}
823
824/// Compares two 32-bit float values in the low-order bits of both
825/// operands to determine if the value in the first operand is not less than
826/// or equal to the corresponding value in the second operand.
827///
828/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
829/// low-order bits of a vector of [4 x float].
830/// If either value in a comparison is NaN, returns true.
831///
832/// \headerfile <x86intrin.h>
833///
834/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
835/// instructions.
836///
837/// \param __a
838/// A 128-bit vector of [4 x float] containing one of the operands. The lower
839/// 32 bits of this operand are used in the comparison.
840/// \param __b
841/// A 128-bit vector of [4 x float] containing one of the operands. The lower
842/// 32 bits of this operand are used in the comparison.
843/// \returns A 128-bit vector of [4 x float] containing the comparison results
844/// in the low-order bits.
845static __inline__ __m128 __DEFAULT_FN_ATTRS
846_mm_cmpnle_ss(__m128 __a, __m128 __b)
847{
848 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
849}
850
851/// Compares each of the corresponding 32-bit float values of the
852/// 128-bit vectors of [4 x float] to determine if the values in the first
853/// operand are not less than or equal to those in the second operand.
854///
855/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
856/// If either value in a comparison is NaN, returns true.
857///
858/// \headerfile <x86intrin.h>
859///
860/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
861/// instructions.
862///
863/// \param __a
864/// A 128-bit vector of [4 x float].
865/// \param __b
866/// A 128-bit vector of [4 x float].
867/// \returns A 128-bit vector of [4 x float] containing the comparison results.
868static __inline__ __m128 __DEFAULT_FN_ATTRS
869_mm_cmpnle_ps(__m128 __a, __m128 __b)
870{
871 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
872}
873
874/// Compares two 32-bit float values in the low-order bits of both
875/// operands to determine if the value in the first operand is not greater
876/// than the corresponding value in the second operand.
877///
878/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
879/// low-order bits of a vector of [4 x float].
880/// If either value in a comparison is NaN, returns true.
881///
882/// \headerfile <x86intrin.h>
883///
884/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
885/// instructions.
886///
887/// \param __a
888/// A 128-bit vector of [4 x float] containing one of the operands. The lower
889/// 32 bits of this operand are used in the comparison.
890/// \param __b
891/// A 128-bit vector of [4 x float] containing one of the operands. The lower
892/// 32 bits of this operand are used in the comparison.
893/// \returns A 128-bit vector of [4 x float] containing the comparison results
894/// in the low-order bits.
895static __inline__ __m128 __DEFAULT_FN_ATTRS
896_mm_cmpngt_ss(__m128 __a, __m128 __b)
897{
898 return (__m128)__builtin_shufflevector((__v4sf)__a,
899 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
900 4, 1, 2, 3);
901}
902
903/// Compares each of the corresponding 32-bit float values of the
904/// 128-bit vectors of [4 x float] to determine if the values in the first
905/// operand are not greater than those in the second operand.
906///
907/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
908/// If either value in a comparison is NaN, returns true.
909///
910/// \headerfile <x86intrin.h>
911///
912/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
913/// instructions.
914///
915/// \param __a
916/// A 128-bit vector of [4 x float].
917/// \param __b
918/// A 128-bit vector of [4 x float].
919/// \returns A 128-bit vector of [4 x float] containing the comparison results.
920static __inline__ __m128 __DEFAULT_FN_ATTRS
921_mm_cmpngt_ps(__m128 __a, __m128 __b)
922{
923 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
924}
925
926/// Compares two 32-bit float values in the low-order bits of both
927/// operands to determine if the value in the first operand is not greater
928/// than or equal to the corresponding value in the second operand.
929///
930/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
931/// low-order bits of a vector of [4 x float].
932/// If either value in a comparison is NaN, returns true.
933///
934/// \headerfile <x86intrin.h>
935///
936/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
937/// instructions.
938///
939/// \param __a
940/// A 128-bit vector of [4 x float] containing one of the operands. The lower
941/// 32 bits of this operand are used in the comparison.
942/// \param __b
943/// A 128-bit vector of [4 x float] containing one of the operands. The lower
944/// 32 bits of this operand are used in the comparison.
945/// \returns A 128-bit vector of [4 x float] containing the comparison results
946/// in the low-order bits.
947static __inline__ __m128 __DEFAULT_FN_ATTRS
948_mm_cmpnge_ss(__m128 __a, __m128 __b)
949{
950 return (__m128)__builtin_shufflevector((__v4sf)__a,
951 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
952 4, 1, 2, 3);
953}
954
955/// Compares each of the corresponding 32-bit float values of the
956/// 128-bit vectors of [4 x float] to determine if the values in the first
957/// operand are not greater than or equal to those in the second operand.
958///
959/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
960/// If either value in a comparison is NaN, returns true.
961///
962/// \headerfile <x86intrin.h>
963///
964/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
965/// instructions.
966///
967/// \param __a
968/// A 128-bit vector of [4 x float].
969/// \param __b
970/// A 128-bit vector of [4 x float].
971/// \returns A 128-bit vector of [4 x float] containing the comparison results.
972static __inline__ __m128 __DEFAULT_FN_ATTRS
973_mm_cmpnge_ps(__m128 __a, __m128 __b)
974{
975 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
976}
977
978/// Compares two 32-bit float values in the low-order bits of both
979/// operands to determine if the value in the first operand is ordered with
980/// respect to the corresponding value in the second operand.
981///
982/// A pair of floating-point values are ordered with respect to each
983/// other if neither value is a NaN. Each comparison returns 0x0 for false,
984/// 0xFFFFFFFF for true.
985///
986/// \headerfile <x86intrin.h>
987///
988/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
989/// instructions.
990///
991/// \param __a
992/// A 128-bit vector of [4 x float] containing one of the operands. The lower
993/// 32 bits of this operand are used in the comparison.
994/// \param __b
995/// A 128-bit vector of [4 x float] containing one of the operands. The lower
996/// 32 bits of this operand are used in the comparison.
997/// \returns A 128-bit vector of [4 x float] containing the comparison results
998/// in the low-order bits.
999static __inline__ __m128 __DEFAULT_FN_ATTRS
1000_mm_cmpord_ss(__m128 __a, __m128 __b)
1001{
1002 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1003}
1004
1005/// Compares each of the corresponding 32-bit float values of the
1006/// 128-bit vectors of [4 x float] to determine if the values in the first
1007/// operand are ordered with respect to those in the second operand.
1008///
1009/// A pair of floating-point values are ordered with respect to each
1010/// other if neither value is a NaN. Each comparison returns 0x0 for false,
1011/// 0xFFFFFFFF for true.
1012///
1013/// \headerfile <x86intrin.h>
1014///
1015/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1016/// instructions.
1017///
1018/// \param __a
1019/// A 128-bit vector of [4 x float].
1020/// \param __b
1021/// A 128-bit vector of [4 x float].
1022/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1023static __inline__ __m128 __DEFAULT_FN_ATTRS
1024_mm_cmpord_ps(__m128 __a, __m128 __b)
1025{
1026 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1027}
1028
1029/// Compares two 32-bit float values in the low-order bits of both
1030/// operands to determine if the value in the first operand is unordered
1031/// with respect to the corresponding value in the second operand.
1032///
1033/// A pair of double-precision values are unordered with respect to each
1034/// other if one or both values are NaN. Each comparison returns 0x0 for
1035/// false, 0xFFFFFFFF for true.
1036///
1037/// \headerfile <x86intrin.h>
1038///
1039/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1040/// instructions.
1041///
1042/// \param __a
1043/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1044/// 32 bits of this operand are used in the comparison.
1045/// \param __b
1046/// A 128-bit vector of [4 x float] containing one of the operands. The lower
1047/// 32 bits of this operand are used in the comparison.
1048/// \returns A 128-bit vector of [4 x float] containing the comparison results
1049/// in the low-order bits.
1050static __inline__ __m128 __DEFAULT_FN_ATTRS
1051_mm_cmpunord_ss(__m128 __a, __m128 __b)
1052{
1053 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1054}
1055
1056/// Compares each of the corresponding 32-bit float values of the
1057/// 128-bit vectors of [4 x float] to determine if the values in the first
1058/// operand are unordered with respect to those in the second operand.
1059///
1060/// A pair of double-precision values are unordered with respect to each
1061/// other if one or both values are NaN. Each comparison returns 0x0 for
1062/// false, 0xFFFFFFFFFFFFFFFF for true.
1063///
1064/// \headerfile <x86intrin.h>
1065///
1066/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1067/// instructions.
1068///
1069/// \param __a
1070/// A 128-bit vector of [4 x float].
1071/// \param __b
1072/// A 128-bit vector of [4 x float].
1073/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1074static __inline__ __m128 __DEFAULT_FN_ATTRS
1075_mm_cmpunord_ps(__m128 __a, __m128 __b)
1076{
1077 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1078}
1079
1080/// Compares two 32-bit float values in the low-order bits of both
1081/// operands for equality.
1082///
1083/// The comparison returns 0 for false, 1 for true. If either value in a
1084/// comparison is NaN, returns 0.
1085///
1086/// \headerfile <x86intrin.h>
1087///
1088/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1089/// instructions.
1090///
1091/// \param __a
1092/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093/// used in the comparison.
1094/// \param __b
1095/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1096/// used in the comparison.
1097/// \returns An integer containing the comparison results.
1098static __inline__ int __DEFAULT_FN_ATTRS
1099_mm_comieq_ss(__m128 __a, __m128 __b)
1100{
1101 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1102}
1103
1104/// Compares two 32-bit float values in the low-order bits of both
1105/// operands to determine if the first operand is less than the second
1106/// operand.
1107///
1108/// The comparison returns 0 for false, 1 for true. If either value in a
1109/// comparison is NaN, returns 0.
1110///
1111/// \headerfile <x86intrin.h>
1112///
1113/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1114/// instructions.
1115///
1116/// \param __a
1117/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1118/// used in the comparison.
1119/// \param __b
1120/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1121/// used in the comparison.
1122/// \returns An integer containing the comparison results.
1123static __inline__ int __DEFAULT_FN_ATTRS
1124_mm_comilt_ss(__m128 __a, __m128 __b)
1125{
1126 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1127}
1128
1129/// Compares two 32-bit float values in the low-order bits of both
1130/// operands to determine if the first operand is less than or equal to the
1131/// second operand.
1132///
1133/// The comparison returns 0 for false, 1 for true. If either value in a
1134/// comparison is NaN, returns 0.
1135///
1136/// \headerfile <x86intrin.h>
1137///
1138/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1139///
1140/// \param __a
1141/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1142/// used in the comparison.
1143/// \param __b
1144/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1145/// used in the comparison.
1146/// \returns An integer containing the comparison results.
1147static __inline__ int __DEFAULT_FN_ATTRS
1148_mm_comile_ss(__m128 __a, __m128 __b)
1149{
1150 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1151}
1152
1153/// Compares two 32-bit float values in the low-order bits of both
1154/// operands to determine if the first operand is greater than the second
1155/// operand.
1156///
1157/// The comparison returns 0 for false, 1 for true. If either value in a
1158/// comparison is NaN, returns 0.
1159///
1160/// \headerfile <x86intrin.h>
1161///
1162/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1163///
1164/// \param __a
1165/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1166/// used in the comparison.
1167/// \param __b
1168/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1169/// used in the comparison.
1170/// \returns An integer containing the comparison results.
1171static __inline__ int __DEFAULT_FN_ATTRS
1172_mm_comigt_ss(__m128 __a, __m128 __b)
1173{
1174 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1175}
1176
1177/// Compares two 32-bit float values in the low-order bits of both
1178/// operands to determine if the first operand is greater than or equal to
1179/// the second operand.
1180///
1181/// The comparison returns 0 for false, 1 for true. If either value in a
1182/// comparison is NaN, returns 0.
1183///
1184/// \headerfile <x86intrin.h>
1185///
1186/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1187///
1188/// \param __a
1189/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1190/// used in the comparison.
1191/// \param __b
1192/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1193/// used in the comparison.
1194/// \returns An integer containing the comparison results.
1195static __inline__ int __DEFAULT_FN_ATTRS
1196_mm_comige_ss(__m128 __a, __m128 __b)
1197{
1198 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1199}
1200
1201/// Compares two 32-bit float values in the low-order bits of both
1202/// operands to determine if the first operand is not equal to the second
1203/// operand.
1204///
1205/// The comparison returns 0 for false, 1 for true. If either value in a
1206/// comparison is NaN, returns 1.
1207///
1208/// \headerfile <x86intrin.h>
1209///
1210/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1211///
1212/// \param __a
1213/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1214/// used in the comparison.
1215/// \param __b
1216/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1217/// used in the comparison.
1218/// \returns An integer containing the comparison results.
1219static __inline__ int __DEFAULT_FN_ATTRS
1220_mm_comineq_ss(__m128 __a, __m128 __b)
1221{
1222 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1223}
1224
1225/// Performs an unordered comparison of two 32-bit float values using
1226/// the low-order bits of both operands to determine equality.
1227///
1228/// The comparison returns 0 for false, 1 for true. If either value in a
1229/// comparison is NaN, returns 0.
1230///
1231/// \headerfile <x86intrin.h>
1232///
1233/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1234///
1235/// \param __a
1236/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1237/// used in the comparison.
1238/// \param __b
1239/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240/// used in the comparison.
1241/// \returns An integer containing the comparison results.
1242static __inline__ int __DEFAULT_FN_ATTRS
1243_mm_ucomieq_ss(__m128 __a, __m128 __b)
1244{
1245 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1246}
1247
1248/// Performs an unordered comparison of two 32-bit float values using
1249/// the low-order bits of both operands to determine if the first operand is
1250/// less than the second operand.
1251///
1252/// The comparison returns 0 for false, 1 for true. If either value in a
1253/// comparison is NaN, returns 0.
1254///
1255/// \headerfile <x86intrin.h>
1256///
1257/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1258///
1259/// \param __a
1260/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1261/// used in the comparison.
1262/// \param __b
1263/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1264/// used in the comparison.
1265/// \returns An integer containing the comparison results.
1266static __inline__ int __DEFAULT_FN_ATTRS
1267_mm_ucomilt_ss(__m128 __a, __m128 __b)
1268{
1269 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1270}
1271
1272/// Performs an unordered comparison of two 32-bit float values using
1273/// the low-order bits of both operands to determine if the first operand is
1274/// less than or equal to the second operand.
1275///
1276/// The comparison returns 0 for false, 1 for true. If either value in a
1277/// comparison is NaN, returns 0.
1278///
1279/// \headerfile <x86intrin.h>
1280///
1281/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1282///
1283/// \param __a
1284/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1285/// used in the comparison.
1286/// \param __b
1287/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1288/// used in the comparison.
1289/// \returns An integer containing the comparison results.
1290static __inline__ int __DEFAULT_FN_ATTRS
1291_mm_ucomile_ss(__m128 __a, __m128 __b)
1292{
1293 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1294}
1295
1296/// Performs an unordered comparison of two 32-bit float values using
1297/// the low-order bits of both operands to determine if the first operand is
1298/// greater than the second operand.
1299///
1300/// The comparison returns 0 for false, 1 for true. If either value in a
1301/// comparison is NaN, returns 0.
1302///
1303/// \headerfile <x86intrin.h>
1304///
1305/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1306///
1307/// \param __a
1308/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1309/// used in the comparison.
1310/// \param __b
1311/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1312/// used in the comparison.
1313/// \returns An integer containing the comparison results.
1314static __inline__ int __DEFAULT_FN_ATTRS
1315_mm_ucomigt_ss(__m128 __a, __m128 __b)
1316{
1317 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1318}
1319
1320/// Performs an unordered comparison of two 32-bit float values using
1321/// the low-order bits of both operands to determine if the first operand is
1322/// greater than or equal to the second operand.
1323///
1324/// The comparison returns 0 for false, 1 for true. If either value in a
1325/// comparison is NaN, returns 0.
1326///
1327/// \headerfile <x86intrin.h>
1328///
1329/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1330///
1331/// \param __a
1332/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1333/// used in the comparison.
1334/// \param __b
1335/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1336/// used in the comparison.
1337/// \returns An integer containing the comparison results.
1338static __inline__ int __DEFAULT_FN_ATTRS
1339_mm_ucomige_ss(__m128 __a, __m128 __b)
1340{
1341 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1342}
1343
1344/// Performs an unordered comparison of two 32-bit float values using
1345/// the low-order bits of both operands to determine inequality.
1346///
1347/// The comparison returns 0 for false, 1 for true. If either value in a
1348/// comparison is NaN, returns 0.
1349///
1350/// \headerfile <x86intrin.h>
1351///
1352/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1353///
1354/// \param __a
1355/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1356/// used in the comparison.
1357/// \param __b
1358/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1359/// used in the comparison.
1360/// \returns An integer containing the comparison results.
1361static __inline__ int __DEFAULT_FN_ATTRS
1362_mm_ucomineq_ss(__m128 __a, __m128 __b)
1363{
1364 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1365}
1366
1367/// Converts a float value contained in the lower 32 bits of a vector of
1368/// [4 x float] into a 32-bit integer.
1369///
1370/// If the converted value does not fit in a 32-bit integer, raises a
1371/// floating-point invalid exception. If the exception is masked, returns
1372/// the most negative integer.
1373///
1374/// \headerfile <x86intrin.h>
1375///
1376/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1377/// instructions.
1378///
1379/// \param __a
1380/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1381/// used in the conversion.
1382/// \returns A 32-bit integer containing the converted value.
1383static __inline__ int __DEFAULT_FN_ATTRS
1385{
1386 return __builtin_ia32_cvtss2si((__v4sf)__a);
1387}
1388
1389/// Converts a float value contained in the lower 32 bits of a vector of
1390/// [4 x float] into a 32-bit integer.
1391///
1392/// If the converted value does not fit in a 32-bit integer, raises a
1393/// floating-point invalid exception. If the exception is masked, returns
1394/// the most negative integer.
1395///
1396/// \headerfile <x86intrin.h>
1397///
1398/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1399/// instructions.
1400///
1401/// \param __a
1402/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1403/// used in the conversion.
1404/// \returns A 32-bit integer containing the converted value.
1405static __inline__ int __DEFAULT_FN_ATTRS
1407{
1408 return _mm_cvtss_si32(__a);
1409}
1410
1411#ifdef __x86_64__
1412
1413/// Converts a float value contained in the lower 32 bits of a vector of
1414/// [4 x float] into a 64-bit integer.
1415///
1416/// If the converted value does not fit in a 32-bit integer, raises a
1417/// floating-point invalid exception. If the exception is masked, returns
1418/// the most negative integer.
1419///
1420/// \headerfile <x86intrin.h>
1421///
1422/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1423/// instructions.
1424///
1425/// \param __a
1426/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1427/// used in the conversion.
1428/// \returns A 64-bit integer containing the converted value.
1429static __inline__ long long __DEFAULT_FN_ATTRS
1430_mm_cvtss_si64(__m128 __a)
1431{
1432 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1433}
1434
1435#endif
1436
1437/// Converts two low-order float values in a 128-bit vector of
1438/// [4 x float] into a 64-bit vector of [2 x i32].
1439///
1440/// If a converted value does not fit in a 32-bit integer, raises a
1441/// floating-point invalid exception. If the exception is masked, returns
1442/// the most negative integer.
1443///
1444/// \headerfile <x86intrin.h>
1445///
1446/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1447///
1448/// \param __a
1449/// A 128-bit vector of [4 x float].
1450/// \returns A 64-bit integer vector containing the converted values.
1451static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1453{
1454 return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
1455}
1456
1457/// Converts two low-order float values in a 128-bit vector of
1458/// [4 x float] into a 64-bit vector of [2 x i32].
1459///
1460/// If a converted value does not fit in a 32-bit integer, raises a
1461/// floating-point invalid exception. If the exception is masked, returns
1462/// the most negative integer.
1463///
1464/// \headerfile <x86intrin.h>
1465///
1466/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1467///
1468/// \param __a
1469/// A 128-bit vector of [4 x float].
1470/// \returns A 64-bit integer vector containing the converted values.
1471static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1473{
1474 return _mm_cvtps_pi32(__a);
1475}
1476
1477/// Converts the lower (first) element of a vector of [4 x float] into a signed
1478/// truncated (rounded toward zero) 32-bit integer.
1479///
1480/// If the converted value does not fit in a 32-bit integer, raises a
1481/// floating-point invalid exception. If the exception is masked, returns
1482/// the most negative integer.
1483///
1484/// \headerfile <x86intrin.h>
1485///
1486/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1487/// instructions.
1488///
1489/// \param __a
1490/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1491/// used in the conversion.
1492/// \returns A 32-bit integer containing the converted value.
1493static __inline__ int __DEFAULT_FN_ATTRS
1495{
1496 return __builtin_ia32_cvttss2si((__v4sf)__a);
1497}
1498
1499/// Converts the lower (first) element of a vector of [4 x float] into a signed
1500/// truncated (rounded toward zero) 32-bit integer.
1501///
1502/// If the converted value does not fit in a 32-bit integer, raises a
1503/// floating-point invalid exception. If the exception is masked, returns
1504/// the most negative integer.
1505///
1506/// \headerfile <x86intrin.h>
1507///
1508/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1509/// instructions.
1510///
1511/// \param __a
1512/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1513/// used in the conversion.
1514/// \returns A 32-bit integer containing the converted value.
1515static __inline__ int __DEFAULT_FN_ATTRS
1517{
1518 return _mm_cvttss_si32(__a);
1519}
1520
1521#ifdef __x86_64__
1522/// Converts the lower (first) element of a vector of [4 x float] into a signed
1523/// truncated (rounded toward zero) 64-bit integer.
1524///
1525/// If the converted value does not fit in a 64-bit integer, raises a
1526/// floating-point invalid exception. If the exception is masked, returns
1527/// the most negative integer.
1528///
1529/// \headerfile <x86intrin.h>
1530///
1531/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1532/// instructions.
1533///
1534/// \param __a
1535/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1536/// used in the conversion.
1537/// \returns A 64-bit integer containing the converted value.
1538static __inline__ long long __DEFAULT_FN_ATTRS
1539_mm_cvttss_si64(__m128 __a)
1540{
1541 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1542}
1543#endif
1544
1545/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1546/// into two signed truncated (rounded toward zero) 32-bit integers,
1547/// returned in a 64-bit vector of [2 x i32].
1548///
1549/// If a converted value does not fit in a 32-bit integer, raises a
1550/// floating-point invalid exception. If the exception is masked, returns
1551/// the most negative integer.
1552///
1553/// \headerfile <x86intrin.h>
1554///
1555/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1556/// instructions.
1557///
1558/// \param __a
1559/// A 128-bit vector of [4 x float].
1560/// \returns A 64-bit integer vector containing the converted values.
1561static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1563{
1564 return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
1565}
1566
1567/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1568/// into two signed truncated (rounded toward zero) 64-bit integers,
1569/// returned in a 64-bit vector of [2 x i32].
1570///
1571/// If a converted value does not fit in a 32-bit integer, raises a
1572/// floating-point invalid exception. If the exception is masked, returns
1573/// the most negative integer.
1574///
1575/// \headerfile <x86intrin.h>
1576///
1577/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1578///
1579/// \param __a
1580/// A 128-bit vector of [4 x float].
1581/// \returns A 64-bit integer vector containing the converted values.
1582static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
1584{
1585 return _mm_cvttps_pi32(__a);
1586}
1587
1588/// Converts a 32-bit signed integer value into a floating point value
1589/// and writes it to the lower 32 bits of the destination. The remaining
1590/// higher order elements of the destination vector are copied from the
1591/// corresponding elements in the first operand.
1592///
1593/// \headerfile <x86intrin.h>
1594///
1595/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1596///
1597/// \param __a
1598/// A 128-bit vector of [4 x float].
1599/// \param __b
1600/// A 32-bit signed integer operand containing the value to be converted.
1601/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1602/// converted value of the second operand. The upper 96 bits are copied from
1603/// the upper 96 bits of the first operand.
1604static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
1605 int __b) {
1606 __a[0] = __b;
1607 return __a;
1608}
1609
1610/// Converts a 32-bit signed integer value into a floating point value
1611/// and writes it to the lower 32 bits of the destination. The remaining
1612/// higher order elements of the destination are copied from the
1613/// corresponding elements in the first operand.
1614///
1615/// \headerfile <x86intrin.h>
1616///
1617/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1618///
1619/// \param __a
1620/// A 128-bit vector of [4 x float].
1621/// \param __b
1622/// A 32-bit signed integer operand containing the value to be converted.
1623/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1624/// converted value of the second operand. The upper 96 bits are copied from
1625/// the upper 96 bits of the first operand.
1626static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
1627 int __b) {
1628 return _mm_cvtsi32_ss(__a, __b);
1629}
1630
1631#ifdef __x86_64__
1632
1633/// Converts a 64-bit signed integer value into a floating point value
1634/// and writes it to the lower 32 bits of the destination. The remaining
1635/// higher order elements of the destination are copied from the
1636/// corresponding elements in the first operand.
1637///
1638/// \headerfile <x86intrin.h>
1639///
1640/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1641///
1642/// \param __a
1643/// A 128-bit vector of [4 x float].
1644/// \param __b
1645/// A 64-bit signed integer operand containing the value to be converted.
1646/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1647/// converted value of the second operand. The upper 96 bits are copied from
1648/// the upper 96 bits of the first operand.
1649static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1650_mm_cvtsi64_ss(__m128 __a, long long __b) {
1651 __a[0] = __b;
1652 return __a;
1653}
1654
1655#endif
1656
1657/// Converts two elements of a 64-bit vector of [2 x i32] into two
1658/// floating point values and writes them to the lower 64-bits of the
1659/// destination. The remaining higher order elements of the destination are
1660/// copied from the corresponding elements in the first operand.
1661///
1662/// \headerfile <x86intrin.h>
1663///
1664/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1665///
1666/// \param __a
1667/// A 128-bit vector of [4 x float].
1668/// \param __b
1669/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1670/// and written to the corresponding low-order elements in the destination.
1671/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1672/// converted value of the second operand. The upper 64 bits are copied from
1673/// the upper 64 bits of the first operand.
1674static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1675_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1676{
1677 return (__m128)__builtin_shufflevector(
1678 (__v4sf)__a,
1679 __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
1680 4, 5, 2, 3);
1681}
1682
1683/// Converts two elements of a 64-bit vector of [2 x i32] into two
1684/// floating point values and writes them to the lower 64-bits of the
1685/// destination. The remaining higher order elements of the destination are
1686/// copied from the corresponding elements in the first operand.
1687///
1688/// \headerfile <x86intrin.h>
1689///
1690/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1691///
1692/// \param __a
1693/// A 128-bit vector of [4 x float].
1694/// \param __b
1695/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1696/// and written to the corresponding low-order elements in the destination.
1697/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1698/// converted value from the second operand. The upper 64 bits are copied
1699/// from the upper 64 bits of the first operand.
1700static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
1701_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1702{
1703 return _mm_cvtpi32_ps(__a, __b);
1704}
1705
1706/// Extracts a float value contained in the lower 32 bits of a vector of
1707/// [4 x float].
1708///
1709/// \headerfile <x86intrin.h>
1710///
1711/// This intrinsic has no corresponding instruction.
1712///
1713/// \param __a
1714/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1715/// used in the extraction.
1716/// \returns A 32-bit float containing the extracted value.
1717static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
1719 return __a[0];
1720}
1721
1722/// Loads two packed float values from the address \a __p into the
1723/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1724/// are copied from the low-order bits of the first operand.
1725///
1726/// \headerfile <x86intrin.h>
1727///
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
1729///
1730/// \param __a
1731/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1732/// of the destination.
1733/// \param __p
1734/// A pointer to two packed float values. Bits [63:0] are written to bits
1735/// [127:64] of the destination.
1736/// \returns A 128-bit vector of [4 x float] containing the moved values.
1737static __inline__ __m128 __DEFAULT_FN_ATTRS
1738_mm_loadh_pi(__m128 __a, const __m64 *__p)
1739{
1740 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1741 struct __mm_loadh_pi_struct {
1742 __mm_loadh_pi_v2f32 __u;
1743 } __attribute__((__packed__, __may_alias__));
1744 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1745 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1746 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1747}
1748
1749/// Loads two packed float values from the address \a __p into the
1750/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1751/// are copied from the high-order bits of the first operand.
1752///
1753/// \headerfile <x86intrin.h>
1754///
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1756///
1757/// \param __a
1758/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1759/// [127:64] of the destination.
1760/// \param __p
1761/// A pointer to two packed float values. Bits [63:0] are written to bits
1762/// [63:0] of the destination.
1763/// \returns A 128-bit vector of [4 x float] containing the moved values.
1764static __inline__ __m128 __DEFAULT_FN_ATTRS
1765_mm_loadl_pi(__m128 __a, const __m64 *__p)
1766{
1767 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1768 struct __mm_loadl_pi_struct {
1769 __mm_loadl_pi_v2f32 __u;
1770 } __attribute__((__packed__, __may_alias__));
1771 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1772 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1773 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1774}
1775
1776/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1777/// 32 bits of the vector are initialized with the single-precision
1778/// floating-point value loaded from a specified memory location. The upper
1779/// 96 bits are set to zero.
1780///
1781/// \headerfile <x86intrin.h>
1782///
1783/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1784///
1785/// \param __p
1786/// A pointer to a 32-bit memory location containing a single-precision
1787/// floating-point value.
1788/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1789/// lower 32 bits contain the value loaded from the memory location. The
1790/// upper 96 bits are set to zero.
1791static __inline__ __m128 __DEFAULT_FN_ATTRS
1792_mm_load_ss(const float *__p)
1793{
1794 struct __mm_load_ss_struct {
1795 float __u;
1796 } __attribute__((__packed__, __may_alias__));
1797 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1798 return __extension__ (__m128){ __u, 0, 0, 0 };
1799}
1800
1801/// Loads a 32-bit float value and duplicates it to all four vector
1802/// elements of a 128-bit vector of [4 x float].
1803///
1804/// \headerfile <x86intrin.h>
1805///
1806/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1807/// instruction.
1808///
1809/// \param __p
1810/// A pointer to a float value to be loaded and duplicated.
1811/// \returns A 128-bit vector of [4 x float] containing the loaded and
1812/// duplicated values.
1813static __inline__ __m128 __DEFAULT_FN_ATTRS
1814_mm_load1_ps(const float *__p)
1815{
1816 struct __mm_load1_ps_struct {
1817 float __u;
1818 } __attribute__((__packed__, __may_alias__));
1819 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1820 return __extension__ (__m128){ __u, __u, __u, __u };
1821}
1822
1823#define _mm_load_ps1(p) _mm_load1_ps(p)
1824
1825/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1826/// memory location.
1827///
1828/// \headerfile <x86intrin.h>
1829///
1830/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1831///
1832/// \param __p
1833/// A pointer to a 128-bit memory location. The address of the memory
1834/// location has to be 128-bit aligned.
1835/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1836static __inline__ __m128 __DEFAULT_FN_ATTRS
1837_mm_load_ps(const float *__p)
1838{
1839 return *(const __m128*)__p;
1840}
1841
1842/// Loads a 128-bit floating-point vector of [4 x float] from an
1843/// unaligned memory location.
1844///
1845/// \headerfile <x86intrin.h>
1846///
1847/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1848///
1849/// \param __p
1850/// A pointer to a 128-bit memory location. The address of the memory
1851/// location does not have to be aligned.
1852/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1853static __inline__ __m128 __DEFAULT_FN_ATTRS
1854_mm_loadu_ps(const float *__p)
1855{
1856 struct __loadu_ps {
1857 __m128_u __v;
1858 } __attribute__((__packed__, __may_alias__));
1859 return ((const struct __loadu_ps*)__p)->__v;
1860}
1861
1862/// Loads four packed float values, in reverse order, from an aligned
1863/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1864///
1865/// \headerfile <x86intrin.h>
1866///
1867/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1868/// instruction.
1869///
1870/// \param __p
1871/// A pointer to a 128-bit memory location. The address of the memory
1872/// location has to be 128-bit aligned.
1873/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1874/// in reverse order.
1875static __inline__ __m128 __DEFAULT_FN_ATTRS
1876_mm_loadr_ps(const float *__p)
1877{
1878 __m128 __a = _mm_load_ps(__p);
1879 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1880}
1881
1882/// Create a 128-bit vector of [4 x float] with undefined values.
1883///
1884/// \headerfile <x86intrin.h>
1885///
1886/// This intrinsic has no corresponding instruction.
1887///
1888/// \returns A 128-bit vector of [4 x float] containing undefined values.
1889static __inline__ __m128 __DEFAULT_FN_ATTRS
1891{
1892 return (__m128)__builtin_ia32_undef128();
1893}
1894
1895/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1896/// 32 bits of the vector are initialized with the specified single-precision
1897/// floating-point value. The upper 96 bits are set to zero.
1898///
1899/// \headerfile <x86intrin.h>
1900///
1901/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1902///
1903/// \param __w
1904/// A single-precision floating-point value used to initialize the lower 32
1905/// bits of the result.
1906/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1907/// lower 32 bits contain the value provided in the source operand. The
1908/// upper 96 bits are set to zero.
1909static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1910_mm_set_ss(float __w) {
1911 return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
1912}
1913
1914/// Constructs a 128-bit floating-point vector of [4 x float], with each
1915/// of the four single-precision floating-point vector elements set to the
1916/// specified single-precision floating-point value.
1917///
1918/// \headerfile <x86intrin.h>
1919///
1920/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1921///
1922/// \param __w
1923/// A single-precision floating-point value used to initialize each vector
1924/// element of the result.
1925/// \returns An initialized 128-bit floating-point vector of [4 x float].
1926static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1927_mm_set1_ps(float __w) {
1928 return __extension__ (__m128){ __w, __w, __w, __w };
1929}
1930
1931/* Microsoft specific. */
1932/// Constructs a 128-bit floating-point vector of [4 x float], with each
1933/// of the four single-precision floating-point vector elements set to the
1934/// specified single-precision floating-point value.
1935///
1936/// \headerfile <x86intrin.h>
1937///
1938/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1939///
1940/// \param __w
1941/// A single-precision floating-point value used to initialize each vector
1942/// element of the result.
1943/// \returns An initialized 128-bit floating-point vector of [4 x float].
1944static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1945_mm_set_ps1(float __w) {
1946 return _mm_set1_ps(__w);
1947}
1948
1949/// Constructs a 128-bit floating-point vector of [4 x float]
1950/// initialized with the specified single-precision floating-point values.
1951///
1952/// \headerfile <x86intrin.h>
1953///
1954/// This intrinsic is a utility function and does not correspond to a specific
1955/// instruction.
1956///
1957/// \param __z
1958/// A single-precision floating-point value used to initialize bits [127:96]
1959/// of the result.
1960/// \param __y
1961/// A single-precision floating-point value used to initialize bits [95:64]
1962/// of the result.
1963/// \param __x
1964/// A single-precision floating-point value used to initialize bits [63:32]
1965/// of the result.
1966/// \param __w
1967/// A single-precision floating-point value used to initialize bits [31:0]
1968/// of the result.
1969/// \returns An initialized 128-bit floating-point vector of [4 x float].
1970static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1971_mm_set_ps(float __z, float __y, float __x, float __w) {
1972 return __extension__ (__m128){ __w, __x, __y, __z };
1973}
1974
1975/// Constructs a 128-bit floating-point vector of [4 x float],
1976/// initialized in reverse order with the specified 32-bit single-precision
1977/// float-point values.
1978///
1979/// \headerfile <x86intrin.h>
1980///
1981/// This intrinsic is a utility function and does not correspond to a specific
1982/// instruction.
1983///
1984/// \param __z
1985/// A single-precision floating-point value used to initialize bits [31:0]
1986/// of the result.
1987/// \param __y
1988/// A single-precision floating-point value used to initialize bits [63:32]
1989/// of the result.
1990/// \param __x
1991/// A single-precision floating-point value used to initialize bits [95:64]
1992/// of the result.
1993/// \param __w
1994/// A single-precision floating-point value used to initialize bits [127:96]
1995/// of the result.
1996/// \returns An initialized 128-bit floating-point vector of [4 x float].
1997static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
1998_mm_setr_ps(float __z, float __y, float __x, float __w) {
1999 return __extension__ (__m128){ __z, __y, __x, __w };
2000}
2001
2002/// Constructs a 128-bit floating-point vector of [4 x float] initialized
2003/// to zero.
2004///
2005/// \headerfile <x86intrin.h>
2006///
2007/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2008///
2009/// \returns An initialized 128-bit floating-point vector of [4 x float] with
2010/// all elements set to zero.
2011static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2013 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2014}
2015
2016/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2017/// memory location.
2018///
2019/// \headerfile <x86intrin.h>
2020///
2021/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
2022///
2023/// \param __p
2024/// A pointer to a 64-bit memory location.
2025/// \param __a
2026/// A 128-bit vector of [4 x float] containing the values to be stored.
2027static __inline__ void __DEFAULT_FN_ATTRS
2028_mm_storeh_pi(__m64 *__p, __m128 __a)
2029{
2030 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2031 struct __mm_storeh_pi_struct {
2032 __mm_storeh_pi_v2f32 __u;
2033 } __attribute__((__packed__, __may_alias__));
2034 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2035}
2036
2037/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2038/// memory location.
2039///
2040/// \headerfile <x86intrin.h>
2041///
2042/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2043///
2044/// \param __p
2045/// A pointer to a memory location that will receive the float values.
2046/// \param __a
2047/// A 128-bit vector of [4 x float] containing the values to be stored.
2048static __inline__ void __DEFAULT_FN_ATTRS
2049_mm_storel_pi(__m64 *__p, __m128 __a)
2050{
2051 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2052 struct __mm_storeh_pi_struct {
2053 __mm_storeh_pi_v2f32 __u;
2054 } __attribute__((__packed__, __may_alias__));
2055 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2056}
2057
2058/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2059/// memory location.
2060///
2061/// \headerfile <x86intrin.h>
2062///
2063/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2064///
2065/// \param __p
2066/// A pointer to a 32-bit memory location.
2067/// \param __a
2068/// A 128-bit vector of [4 x float] containing the value to be stored.
2069static __inline__ void __DEFAULT_FN_ATTRS
2070_mm_store_ss(float *__p, __m128 __a)
2071{
2072 struct __mm_store_ss_struct {
2073 float __u;
2074 } __attribute__((__packed__, __may_alias__));
2075 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2076}
2077
2078/// Stores a 128-bit vector of [4 x float] to an unaligned memory
2079/// location.
2080///
2081/// \headerfile <x86intrin.h>
2082///
2083/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2084///
2085/// \param __p
2086/// A pointer to a 128-bit memory location. The address of the memory
2087/// location does not have to be aligned.
2088/// \param __a
2089/// A 128-bit vector of [4 x float] containing the values to be stored.
2090static __inline__ void __DEFAULT_FN_ATTRS
2091_mm_storeu_ps(float *__p, __m128 __a)
2092{
2093 struct __storeu_ps {
2094 __m128_u __v;
2095 } __attribute__((__packed__, __may_alias__));
2096 ((struct __storeu_ps*)__p)->__v = __a;
2097}
2098
2099/// Stores a 128-bit vector of [4 x float] into an aligned memory
2100/// location.
2101///
2102/// \headerfile <x86intrin.h>
2103///
2104/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2105///
2106/// \param __p
2107/// A pointer to a 128-bit memory location. The address of the memory
2108/// location has to be 16-byte aligned.
2109/// \param __a
2110/// A 128-bit vector of [4 x float] containing the values to be stored.
2111static __inline__ void __DEFAULT_FN_ATTRS
2112_mm_store_ps(float *__p, __m128 __a)
2113{
2114 *(__m128*)__p = __a;
2115}
2116
2117/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2118/// four contiguous elements in an aligned memory location.
2119///
2120/// \headerfile <x86intrin.h>
2121///
2122/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2123/// instruction.
2124///
2125/// \param __p
2126/// A pointer to a 128-bit memory location.
2127/// \param __a
2128/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2129/// of the four contiguous elements pointed by \a __p.
2130static __inline__ void __DEFAULT_FN_ATTRS
2131_mm_store1_ps(float *__p, __m128 __a)
2132{
2133 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2135}
2136
2137/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2138/// four contiguous elements in an aligned memory location.
2139///
2140/// \headerfile <x86intrin.h>
2141///
2142/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2143/// instruction.
2144///
2145/// \param __p
2146/// A pointer to a 128-bit memory location.
2147/// \param __a
2148/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2149/// of the four contiguous elements pointed by \a __p.
2150static __inline__ void __DEFAULT_FN_ATTRS
2151_mm_store_ps1(float *__p, __m128 __a)
2152{
2154}
2155
2156/// Stores float values from a 128-bit vector of [4 x float] to an
2157/// aligned memory location in reverse order.
2158///
2159/// \headerfile <x86intrin.h>
2160///
2161/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2162/// instruction.
2163///
2164/// \param __p
2165/// A pointer to a 128-bit memory location. The address of the memory
2166/// location has to be 128-bit aligned.
2167/// \param __a
2168/// A 128-bit vector of [4 x float] containing the values to be stored.
2169static __inline__ void __DEFAULT_FN_ATTRS
2170_mm_storer_ps(float *__p, __m128 __a)
2171{
2172 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2174}
2175
/* Prefetch hint selectors, listed by value. The non-MSVC _mm_prefetch macro
   in this file passes bit 2 of the selector as the second argument of
   __builtin_prefetch and bits [1:0] as the third. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2182
2183#ifndef _MSC_VER
2184// If _MSC_VER is defined, we use the builtin variant of _mm_prefetch.
2185// Otherwise, we provide this macro, which includes a cast, allowing the user
// to pass a pointer of any type. The _mm_prefetch accepts char to match MSVC.
2187
2188/// Loads one cache line of data from the specified address to a location
2189/// closer to the processor.
2190///
2191/// \headerfile <x86intrin.h>
2192///
2193/// \code
2194/// void _mm_prefetch(const void *a, const int sel);
2195/// \endcode
2196///
2197/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
2198///
2199/// \param a
2200/// A pointer to a memory location containing a cache line of data.
2201/// \param sel
2202/// A predefined integer constant specifying the type of prefetch
2203/// operation: \n
2204/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2205/// PREFETCHNTA instruction will be generated. \n
2206/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2207/// be generated. \n
2208/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2209/// be generated. \n
2210/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2211/// be generated.
/* Bit 2 of sel becomes the second __builtin_prefetch argument and bits [1:0]
   the third; the address is cast so any pointer type is accepted. */
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 0x1, (sel) & 3))
2214#endif
2215
2216/// Stores a 64-bit integer in the specified aligned memory location. To
2217/// minimize caching, the data is flagged as non-temporal (unlikely to be
2218/// used again soon).
2219///
2220/// \headerfile <x86intrin.h>
2221///
2222/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2223///
2224/// \param __p
2225/// A pointer to an aligned memory location used to store the register value.
2226/// \param __a
2227/// A 64-bit integer containing the value to be stored.
2228static __inline__ void __DEFAULT_FN_ATTRS
2229_mm_stream_pi(void *__p, __m64 __a)
2230{
2231 __builtin_nontemporal_store(__a, (__m64 *)__p);
2232}
2233
2234/// Moves packed float values from a 128-bit vector of [4 x float] to a
2235/// 128-bit aligned memory location. To minimize caching, the data is flagged
2236/// as non-temporal (unlikely to be used again soon).
2237///
2238/// \headerfile <x86intrin.h>
2239///
2240/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2241///
2242/// \param __p
2243/// A pointer to a 128-bit aligned memory location that will receive the
2244/// single-precision floating-point values.
2245/// \param __a
2246/// A 128-bit vector of [4 x float] containing the values to be moved.
2247static __inline__ void __DEFAULT_FN_ATTRS
2248_mm_stream_ps(void *__p, __m128 __a)
2249{
2250 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2251}
2252
#ifdef __cplusplus
extern "C" {
#endif

/// Orders store instructions: every store issued before this intrinsic is
///    completed before any store issued after it is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2271
2272/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
2273/// returns it, as specified by the immediate integer operand.
2274///
2275/// \headerfile <x86intrin.h>
2276///
2277/// \code
2278/// int _mm_extract_pi16(__m64 a, int n);
2279/// \endcode
2280///
2281/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
2282///
2283/// \param a
2284/// A 64-bit vector of [4 x i16].
2285/// \param n
2286/// An immediate integer operand that determines which bits are extracted: \n
2287/// 0: Bits [15:0] are copied to the destination. \n
2288/// 1: Bits [31:16] are copied to the destination. \n
2289/// 2: Bits [47:32] are copied to the destination. \n
2290/// 3: Bits [63:48] are copied to the destination.
2291/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Both macro arguments are parenthesized before casting so that an
   expression argument (for example a conditional or an addition) is cast as
   a whole rather than only its first operand. */
#define _mm_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2294
2295/// Copies data from the 64-bit vector of [4 x i16] to the destination,
2296/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
2297/// specified by the immediate operand \a n.
2298///
2299/// \headerfile <x86intrin.h>
2300///
2301/// \code
2302/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
2303/// \endcode
2304///
2305/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
2306///
2307/// \param a
2308/// A 64-bit vector of [4 x i16].
2309/// \param d
2310/// An integer. The lower 16-bit value from this operand is written to the
2311/// destination at the offset specified by operand \a n.
2312/// \param n
/// An immediate integer operand that determines which bits are written in
/// the destination. \n
2315/// 0: Bits [15:0] are copied to the destination. \n
2316/// 1: Bits [31:16] are copied to the destination. \n
2317/// 2: Bits [47:32] are copied to the destination. \n
2318/// 3: Bits [63:48] are copied to the destination. \n
2319/// The remaining bits in the destination are copied from the corresponding
2320/// bits in operand \a a.
2321/// \returns A 64-bit integer vector containing the copied packed data from the
2322/// operands.
/* All three macro arguments are parenthesized before casting so that an
   expression argument is cast as a whole rather than only its first
   operand. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2325
2326/// Compares each of the corresponding packed 16-bit integer values of
2327/// the 64-bit integer vectors, and writes the greater value to the
2328/// corresponding bits in the destination.
2329///
2330/// \headerfile <x86intrin.h>
2331///
2332/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2333///
2334/// \param __a
2335/// A 64-bit integer vector containing one of the source operands.
2336/// \param __b
2337/// A 64-bit integer vector containing one of the source operands.
2338/// \returns A 64-bit integer vector containing the comparison results.
2339static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2340_mm_max_pi16(__m64 __a, __m64 __b) {
2341 return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
2342}
2343
2344/// Compares each of the corresponding packed 8-bit unsigned integer
2345/// values of the 64-bit integer vectors, and writes the greater value to the
2346/// corresponding bits in the destination.
2347///
2348/// \headerfile <x86intrin.h>
2349///
2350/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2351///
2352/// \param __a
2353/// A 64-bit integer vector containing one of the source operands.
2354/// \param __b
2355/// A 64-bit integer vector containing one of the source operands.
2356/// \returns A 64-bit integer vector containing the comparison results.
2357static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2358_mm_max_pu8(__m64 __a, __m64 __b) {
2359 return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
2360}
2361
2362/// Compares each of the corresponding packed 16-bit integer values of
2363/// the 64-bit integer vectors, and writes the lesser value to the
2364/// corresponding bits in the destination.
2365///
2366/// \headerfile <x86intrin.h>
2367///
2368/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2369///
2370/// \param __a
2371/// A 64-bit integer vector containing one of the source operands.
2372/// \param __b
2373/// A 64-bit integer vector containing one of the source operands.
2374/// \returns A 64-bit integer vector containing the comparison results.
2375static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2376_mm_min_pi16(__m64 __a, __m64 __b) {
2377 return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
2378}
2379
2380/// Compares each of the corresponding packed 8-bit unsigned integer
2381/// values of the 64-bit integer vectors, and writes the lesser value to the
2382/// corresponding bits in the destination.
2383///
2384/// \headerfile <x86intrin.h>
2385///
2386/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2387///
2388/// \param __a
2389/// A 64-bit integer vector containing one of the source operands.
2390/// \param __b
2391/// A 64-bit integer vector containing one of the source operands.
2392/// \returns A 64-bit integer vector containing the comparison results.
2393static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2394_mm_min_pu8(__m64 __a, __m64 __b) {
2395 return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
2396}
2397
2398/// Takes the most significant bit from each 8-bit element in a 64-bit
2399/// integer vector to create an 8-bit mask value. Zero-extends the value to
2400/// 32-bit integer and writes it to the destination.
2401///
2402/// \headerfile <x86intrin.h>
2403///
2404/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2405///
2406/// \param __a
2407/// A 64-bit integer vector containing the values with bits to be extracted.
2408/// \returns The most significant bit from each 8-bit element in \a __a,
2409/// written to bits [7:0].
2410static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2412 return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
2413}
2414
2415/// Multiplies packed 16-bit unsigned integer values and writes the
2416/// high-order 16 bits of each 32-bit product to the corresponding bits in
2417/// the destination.
2418///
2419/// \headerfile <x86intrin.h>
2420///
2421/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2422///
2423/// \param __a
2424/// A 64-bit integer vector containing one of the source operands.
2425/// \param __b
2426/// A 64-bit integer vector containing one of the source operands.
2427/// \returns A 64-bit integer vector containing the products of both operands.
2428static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2430{
2431 return __trunc64(__builtin_ia32_pmulhuw128((__v8hu)__zext128(__a),
2432 (__v8hu)__zext128(__b)));
2433}
2434
2435/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2436/// destination, as specified by the immediate value operand.
2437///
2438/// \headerfile <x86intrin.h>
2439///
2440/// \code
2441/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2442/// \endcode
2443///
2444/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2445///
2446/// \param a
2447/// A 64-bit integer vector containing the values to be shuffled.
2448/// \param n
2449/// An immediate value containing an 8-bit value specifying which elements to
2450/// copy from \a a. The destinations within the 64-bit destination are
2451/// assigned values as follows: \n
2452/// Bits [1:0] are used to assign values to bits [15:0] in the
2453/// destination. \n
2454/// Bits [3:2] are used to assign values to bits [31:16] in the
2455/// destination. \n
2456/// Bits [5:4] are used to assign values to bits [47:32] in the
2457/// destination. \n
2458/// Bits [7:6] are used to assign values to bits [63:48] in the
2459/// destination. \n
2460/// Bit value assignments: \n
2461/// 00: assigned from bits [15:0] of \a a. \n
2462/// 01: assigned from bits [31:16] of \a a. \n
2463/// 10: assigned from bits [47:32] of \a a. \n
2464/// 11: assigned from bits [63:48] of \a a. \n
2465/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2466/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2467/// <c>[b6, b4, b2, b0]</c>.
2468/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                                 \
  /* Each 2-bit field of n picks the source element for one result lane. */   \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3,                 \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
2473
2474/// Conditionally copies the values from each 8-bit element in the first
2475/// 64-bit integer vector operand to the specified memory location, as
2476/// specified by the most significant bit in the corresponding element in the
2477/// second 64-bit integer vector operand.
2478///
2479/// To minimize caching, the data is flagged as non-temporal
2480/// (unlikely to be used again soon).
2481///
2482/// \headerfile <x86intrin.h>
2483///
2484/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2485///
2486/// \param __d
2487/// A 64-bit integer vector containing the values with elements to be copied.
2488/// \param __n
2489/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2490/// element determines whether the corresponding element in operand \a __d
2491/// is copied. If the most significant bit of a given element is 1, the
2492/// corresponding element in operand \a __d is copied.
2493/// \param __p
2494/// A pointer to a 64-bit memory location that will receive the conditionally
2495/// copied integer values. The address of the memory location does not have
2496/// to be aligned.
2497static __inline__ void __DEFAULT_FN_ATTRS_SSE2
2498_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2499{
2500 // This is complex, because we need to support the case where __p is pointing
2501 // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
2502 // write might cause a trap where a 64-bit maskmovq would not. (Memory
2503 // locations not selected by the mask bits might still cause traps.)
2504 __m128i __d128 = __anyext128(__d);
2505 __m128i __n128 = __zext128(__n);
2506 if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
2507 ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
2508 // If there's a risk of spurious trap due to a 128-bit write, back up the
2509 // pointer by 8 bytes and shift values in registers to match.
2510 __p -= 8;
2511 __d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8);
2512 __n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8);
2513 }
2514
2515 __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
2516}
2517
2518/// Computes the rounded averages of the packed unsigned 8-bit integer
2519/// values and writes the averages to the corresponding bits in the
2520/// destination.
2521///
2522/// \headerfile <x86intrin.h>
2523///
2524/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2525///
2526/// \param __a
2527/// A 64-bit integer vector containing one of the source operands.
2528/// \param __b
2529/// A 64-bit integer vector containing one of the source operands.
2530/// \returns A 64-bit integer vector containing the averages of both operands.
2531static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2532_mm_avg_pu8(__m64 __a, __m64 __b) {
2533 return __trunc64(__builtin_ia32_pavgb128((__v16qu)__zext128(__a),
2534 (__v16qu)__zext128(__b)));
2535}
2536
2537/// Computes the rounded averages of the packed unsigned 16-bit integer
2538/// values and writes the averages to the corresponding bits in the
2539/// destination.
2540///
2541/// \headerfile <x86intrin.h>
2542///
2543/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2544///
2545/// \param __a
2546/// A 64-bit integer vector containing one of the source operands.
2547/// \param __b
2548/// A 64-bit integer vector containing one of the source operands.
2549/// \returns A 64-bit integer vector containing the averages of both operands.
2550static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2551_mm_avg_pu16(__m64 __a, __m64 __b) {
2552 return __trunc64(
2553 __builtin_ia32_pavgw128((__v8hu)__zext128(__a), (__v8hu)__zext128(__b)));
2554}
2555
/// Subtracts the corresponding 8-bit unsigned integer values of the two
/// 64-bit vector operands and computes the absolute value of each
/// difference. The sum of the 8 absolute differences is then written to
/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2560///
2561/// \headerfile <x86intrin.h>
2562///
2563/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2564///
2565/// \param __a
2566/// A 64-bit integer vector containing one of the source operands.
2567/// \param __b
2568/// A 64-bit integer vector containing one of the source operands.
2569/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2570/// sets of absolute differences between both operands. The upper bits are
2571/// cleared.
2572static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2573_mm_sad_pu8(__m64 __a, __m64 __b)
2574{
2575 return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2576 (__v16qi)__zext128(__b)));
2577}
2578
2579#if defined(__cplusplus)
2580extern "C" {
2581#endif
2582
2583/// Returns the contents of the MXCSR register as a 32-bit unsigned
2584/// integer value.
2585///
2586/// There are several groups of macros associated with this
2587/// intrinsic, including:
2588/// <ul>
2589/// <li>
2590/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2591/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2592/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2593/// _MM_GET_EXCEPTION_STATE().
2594/// </li>
2595/// <li>
2596/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2597/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2598/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2599/// </li>
2600/// <li>
2601/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2602/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2603/// _MM_GET_ROUNDING_MODE().
2604/// </li>
2605/// <li>
2606/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2607/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2608/// </li>
2609/// <li>
2610/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2611/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2612/// _MM_GET_DENORMALS_ZERO_MODE().
2613/// </li>
2614/// </ul>
2615///
2616/// For example, the following expression checks if an overflow exception has
2617/// occurred:
2618/// \code
2619/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2620/// \endcode
2621///
2622/// The following expression gets the current rounding mode:
2623/// \code
2624/// _MM_GET_ROUNDING_MODE()
2625/// \endcode
2626///
2627/// \headerfile <x86intrin.h>
2628///
2629/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2630///
2631/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2632/// register.
2633unsigned int _mm_getcsr(void);
2634
2635/// Sets the MXCSR register with the 32-bit unsigned integer value.
2636///
2637/// There are several groups of macros associated with this intrinsic,
2638/// including:
2639/// <ul>
2640/// <li>
2641/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2642/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2643/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2644/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2645/// </li>
2646/// <li>
2647/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2648/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2649/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2650/// of these macros.
2651/// </li>
2652/// <li>
2653/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2654/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2655/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2656/// </li>
2657/// <li>
2658/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2659/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2660/// one of these macros.
2661/// </li>
2662/// <li>
2663/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2664/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2665/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2666/// </li>
2667/// </ul>
2668///
/// For example, the following expression causes subsequent floating-point
/// operations to round up:
/// \code
/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
/// \endcode
2672///
2673/// The following example sets the DAZ and FTZ flags:
2674/// \code
2675/// void setFlags() {
2676/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2677/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2678/// }
2679/// \endcode
2680///
2681/// \headerfile <x86intrin.h>
2682///
2683/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2684///
2685/// \param __i
2686/// A 32-bit unsigned integer value to be written to the MXCSR register.
2687void _mm_setcsr(unsigned int __i);
2688
2689#if defined(__cplusplus)
2690} // extern "C"
2691#endif
2692
2693/// Selects 4 float values from the 128-bit operands of [4 x float], as
2694/// specified by the immediate value operand.
2695///
2696/// \headerfile <x86intrin.h>
2697///
2698/// \code
2699/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2700/// \endcode
2701///
2702/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2703///
2704/// \param a
2705/// A 128-bit vector of [4 x float].
2706/// \param b
2707/// A 128-bit vector of [4 x float].
2708/// \param mask
2709/// An immediate value containing an 8-bit value specifying which elements to
2710/// copy from \a a and \a b. \n
2711/// Bits [3:0] specify the values copied from operand \a a. \n
2712/// Bits [7:4] specify the values copied from operand \a b. \n
2713/// The destinations within the 128-bit destination are assigned values as
2714/// follows: \n
2715/// Bits [1:0] are used to assign values to bits [31:0] in the
2716/// destination. \n
2717/// Bits [3:2] are used to assign values to bits [63:32] in the
2718/// destination. \n
2719/// Bits [5:4] are used to assign values to bits [95:64] in the
2720/// destination. \n
2721/// Bits [7:6] are used to assign values to bits [127:96] in the
2722/// destination. \n
2723/// Bit value assignments: \n
2724/// 00: Bits [31:0] copied from the specified operand. \n
2725/// 01: Bits [63:32] copied from the specified operand. \n
2726/// 10: Bits [95:64] copied from the specified operand. \n
2727/// 11: Bits [127:96] copied from the specified operand. \n
2728/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2729/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2730/// <c>[b6, b4, b2, b0]</c>.
2731/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask)                                             \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b),     \
                                 (int)(mask)))
2735
2736/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2737/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2738///
2739/// \headerfile <x86intrin.h>
2740///
2741/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2742///
2743/// \param __a
2744/// A 128-bit vector of [4 x float]. \n
2745/// Bits [95:64] are written to bits [31:0] of the destination. \n
2746/// Bits [127:96] are written to bits [95:64] of the destination.
/// \param __b
/// A 128-bit vector of [4 x float]. \n
/// Bits [95:64] are written to bits [63:32] of the destination. \n
/// Bits [127:96] are written to bits [127:96] of the destination.
2751/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2752static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2753_mm_unpackhi_ps(__m128 __a, __m128 __b) {
2754 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2755}
2756
2757/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2758/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2759///
2760/// \headerfile <x86intrin.h>
2761///
2762/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2763///
2764/// \param __a
2765/// A 128-bit vector of [4 x float]. \n
2766/// Bits [31:0] are written to bits [31:0] of the destination. \n
2767/// Bits [63:32] are written to bits [95:64] of the destination.
2768/// \param __b
2769/// A 128-bit vector of [4 x float]. \n
2770/// Bits [31:0] are written to bits [63:32] of the destination. \n
2771/// Bits [63:32] are written to bits [127:96] of the destination.
2772/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2773static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2774_mm_unpacklo_ps(__m128 __a, __m128 __b) {
2775 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2776}
2777
2778/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2779/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2780/// 96 bits are set to the upper 96 bits of the first parameter.
2781///
2782/// \headerfile <x86intrin.h>
2783///
2784/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2785/// instruction.
2786///
2787/// \param __a
2788/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2789/// written to the upper 96 bits of the result.
2790/// \param __b
2791/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2792/// written to the lower 32 bits of the result.
2793/// \returns A 128-bit floating-point vector of [4 x float].
2794static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2795_mm_move_ss(__m128 __a, __m128 __b) {
2796 __a[0] = __b[0];
2797 return __a;
2798}
2799
2800/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2801/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2802/// 64 bits are set to the upper 64 bits of the first parameter.
2803///
2804/// \headerfile <x86intrin.h>
2805///
2806/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2807///
2808/// \param __a
2809/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2810/// written to the upper 64 bits of the result.
2811/// \param __b
2812/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2813/// written to the lower 64 bits of the result.
2814/// \returns A 128-bit floating-point vector of [4 x float].
2815static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2816_mm_movehl_ps(__m128 __a, __m128 __b) {
2817 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2818}
2819
2820/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2821/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2822/// 64 bits are set to the lower 64 bits of the second parameter.
2823///
2824/// \headerfile <x86intrin.h>
2825///
2826/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2827///
2828/// \param __a
2829/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2830/// written to the lower 64 bits of the result.
2831/// \param __b
2832/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2833/// written to the upper 64 bits of the result.
2834/// \returns A 128-bit floating-point vector of [4 x float].
2835static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
2836_mm_movelh_ps(__m128 __a, __m128 __b) {
2837 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2838}
2839
2840/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2841/// float].
2842///
2843/// \headerfile <x86intrin.h>
2844///
2845/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2846///
2847/// \param __a
2848/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2849/// from the corresponding elements in this operand.
2850/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2851/// values from the operand.
2852static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2854{
2855 return __builtin_convertvector((__v4hi)__a, __v4sf);
2856}
2857
2858/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2859/// 128-bit vector of [4 x float].
2860///
2861/// \headerfile <x86intrin.h>
2862///
2863/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2864///
2865/// \param __a
2866/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2867/// destination are copied from the corresponding elements in this operand.
2868/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2869/// values from the operand.
2870static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2872{
2873 return __builtin_convertvector((__v4hu)__a, __v4sf);
2874}
2875
2876/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2877/// into a 128-bit vector of [4 x float].
2878///
2879/// \headerfile <x86intrin.h>
2880///
2881/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2882///
2883/// \param __a
2884/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2885/// from the corresponding lower 4 elements in this operand.
2886/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2887/// values from the operand.
2888static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2890{
2891 return __builtin_convertvector(
2892 __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2893 0, 1, 2, 3), __v4sf);
2894}
2895
2896/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2897/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2898///
2899/// \headerfile <x86intrin.h>
2900///
2901/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2902///
2903/// \param __a
2904/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2905/// destination are copied from the corresponding lower 4 elements in this
2906/// operand.
2907/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2908/// values from the source operand.
2909static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2911{
2912 return __builtin_convertvector(
2913 __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2914 0, 1, 2, 3), __v4sf);
2915}
2916
2917/// Converts the two 32-bit signed integer values from each 64-bit vector
2918/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2919///
2920/// \headerfile <x86intrin.h>
2921///
2922/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2923///
2924/// \param __a
2925/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2926/// copied from the elements in this operand.
2927/// \param __b
2928/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2929/// copied from the elements in this operand.
2930/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2931/// copied and converted values from the first operand. The upper 64 bits
2932/// contain the copied and converted values from the second operand.
2933static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
2935{
2936 return __builtin_convertvector(
2937 __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2938 0, 1, 2, 3), __v4sf);
2939}
2940
2941/// Converts each single-precision floating-point element of a 128-bit
2942/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2943/// packs the results into a 64-bit integer vector of [4 x i16].
2944///
2945/// If the floating-point element is NaN or infinity, or if the
2946/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2947/// it is converted to 0x8000. Otherwise if the floating-point element is
2948/// greater than 0x7FFF, it is converted to 0x7FFF.
2949///
2950/// \headerfile <x86intrin.h>
2951///
2952/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2953///
2954/// \param __a
2955/// A 128-bit floating-point vector of [4 x float].
2956/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2957/// values.
2958static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2960{
2961 return __trunc64(__builtin_ia32_packssdw128(
2962 (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
2963}
2964
2965/// Converts each single-precision floating-point element of a 128-bit
2966/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2967/// packs the results into the lower 32 bits of a 64-bit integer vector of
2968/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2969///
2970/// If the floating-point element is NaN or infinity, or if the
2971/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2972/// is converted to 0x80. Otherwise if the floating-point element is greater
2973/// than 0x7F, it is converted to 0x7F.
2974///
2975/// \headerfile <x86intrin.h>
2976///
2977/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2978///
2979/// \param __a
2980/// 128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
/// converted values and the upper 32 bits are set to zero.
2983static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
2985{
2986 __m64 __b, __c;
2987
2990
2991 return _mm_packs_pi16(__b, __c);
2992}
2993
/// Extracts the sign bits from each single-precision floating-point
/// element of a 128-bit floating-point vector of [4 x float] and returns the
/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
/// to zero.
2998///
2999/// \headerfile <x86intrin.h>
3000///
3001/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3002///
3003/// \param __a
3004/// A 128-bit floating-point vector of [4 x float].
3005/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3006/// single-precision floating-point element of the parameter. Bits [31:4] are
3007/// set to zero.
3008static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_ps(__m128 __a) {
3009 return __builtin_ia32_movmskps((__v4sf)__a);
3010}
3011
/* Compare predicates for the immediate operand of _mm_cmp_ps/_mm_cmp_ss. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
3021
3022/// Compares each of the corresponding values of two 128-bit vectors of
3023/// [4 x float], using the operation specified by the immediate integer
3024/// operand.
3025///
3026/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3027/// If either value in a comparison is NaN, comparisons that are ordered
3028/// return false, and comparisons that are unordered return true.
3029///
3030/// \headerfile <x86intrin.h>
3031///
3032/// \code
3033/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
3034/// \endcode
3035///
3036/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
3037///
3038/// \param a
3039/// A 128-bit vector of [4 x float].
3040/// \param b
3041/// A 128-bit vector of [4 x float].
3042/// \param c
3043/// An immediate integer operand, with bits [4:0] specifying which comparison
3044/// operation to use: \n
3045/// 0x00: Equal (ordered, non-signaling) \n
3046/// 0x01: Less-than (ordered, signaling) \n
3047/// 0x02: Less-than-or-equal (ordered, signaling) \n
3048/// 0x03: Unordered (non-signaling) \n
3049/// 0x04: Not-equal (unordered, non-signaling) \n
3050/// 0x05: Not-less-than (unordered, signaling) \n
3051/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3052/// 0x07: Ordered (non-signaling) \n
3053/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3056
3057/// Compares each of the corresponding scalar values of two 128-bit
3058/// vectors of [4 x float], using the operation specified by the immediate
3059/// integer operand.
3060///
3061/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3062/// If either value in a comparison is NaN, comparisons that are ordered
3063/// return false, and comparisons that are unordered return true.
3064///
3065/// \headerfile <x86intrin.h>
3066///
3067/// \code
3068/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
3069/// \endcode
3070///
3071/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
3072///
3073/// \param a
3074/// A 128-bit vector of [4 x float].
3075/// \param b
3076/// A 128-bit vector of [4 x float].
3077/// \param c
3078/// An immediate integer operand, with bits [4:0] specifying which comparison
3079/// operation to use: \n
3080/// 0x00: Equal (ordered, non-signaling) \n
3081/// 0x01: Less-than (ordered, signaling) \n
3082/// 0x02: Less-than-or-equal (ordered, signaling) \n
3083/// 0x03: Unordered (non-signaling) \n
3084/// 0x04: Not-equal (unordered, non-signaling) \n
3085/// 0x05: Not-less-than (unordered, signaling) \n
3086/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3087/// 0x07: Ordered (non-signaling) \n
3088/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3091
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit selectors into one 8-bit immediate; z occupies bits [7:6]
 * and w occupies bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3095
/* MXCSR exception state flags. */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception mask bits. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding modes. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero mode. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)
3121
/* Read one field of MXCSR. */
#define _MM_GET_EXCEPTION_MASK()  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE()   (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace one field of MXCSR: clear the field's bits in the current value,
 * OR in x, and write the result back. x is not masked to the field. */
#define _MM_SET_EXCEPTION_MASK(x)                                              \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x)                                               \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3131
/* Transposes, in place, the 4x4 float matrix held in four __m128 lvalues.
 * The temporaries use reserved (double-underscore) names so that they cannot
 * shadow caller variables passed as row arguments (for example a caller's own
 * tmp0). Each row argument is evaluated more than once. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
3144
/* Aliases for compatibility. */
#define _m_pextrw   _mm_extract_pi16
#define _m_pinsrw   _mm_insert_pi16
#define _m_pmaxsw   _mm_max_pi16
#define _m_pmaxub   _mm_max_pu8
#define _m_pminsw   _mm_min_pi16
#define _m_pminub   _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw  _mm_mulhi_pu16
#define _m_pshufw   _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb    _mm_avg_pu8
#define _m_pavgw    _mm_avg_pu16
#define _m_psadbw   _mm_sad_pu8
#define _m_         _mm_
3160
3161#undef __trunc64
3162#undef __zext128
3163#undef __anyext128
3164#undef __zeroupper64
3165#undef __DEFAULT_FN_ATTRS
3166#undef __DEFAULT_FN_ATTRS_CONSTEXPR
3167#undef __DEFAULT_FN_ATTRS_SSE2
3168#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
3169
3170/* Ugly hack for backwards-compatibility (compatible with gcc) */
3171#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3172#include <emmintrin.h>
3173#endif
3174
3175#endif /* __XMMINTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
#define __DEFAULT_FN_ATTRS
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
static __inline__ uint32_t uint32_t __y
Definition arm_acle.h:131
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ void int __a
Definition emmintrin.h:4077
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition mmintrin.h:148
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition mmintrin.h:1273
#define __DEFAULT_FN_ATTRS_SSE2
Definition mmintrin.h:47
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1172
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition xmmintrin.h:1494
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition xmmintrin.h:2551
static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
Definition xmmintrin.h:2411
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition xmmintrin.h:267
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2795
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are less than those in the second operand.
Definition xmmintrin.h:573
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1701
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition xmmintrin.h:234
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not greater than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:948
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equality.
Definition xmmintrin.h:525
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2853
#define __anyext128(x)
Definition xmmintrin.h:56
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is less than the corresponding value in the second operand.
Definition xmmintrin.h:551
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition xmmintrin.h:2532
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition xmmintrin.h:1472
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition xmmintrin.h:1406
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1675
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:504
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition xmmintrin.h:2429
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:160
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
Definition xmmintrin.h:48
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition xmmintrin.h:1837
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for inequality.
Definition xmmintrin.h:769
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is less than or equal to the second operand.
Definition xmmintrin.h:1148
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition xmmintrin.h:2934
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
Definition xmmintrin.h:2170
#define __zeroupper64(x)
Definition xmmintrin.h:59
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition xmmintrin.h:1890
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:846
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition xmmintrin.h:1971
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than the second operand.
Definition xmmintrin.h:1267
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition xmmintrin.h:119
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1604
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition xmmintrin.h:2774
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:79
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
Definition xmmintrin.h:1718
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is less than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:599
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition xmmintrin.h:2573
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
Definition xmmintrin.h:98
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
Definition xmmintrin.h:320
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:480
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2049
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1945
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is less than or equal to the second operand.
Definition xmmintrin.h:1291
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than or equal to those in the second operand.
Definition xmmintrin.h:721
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:1099
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
Definition xmmintrin.h:363
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition xmmintrin.h:2131
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1626
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1927
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition xmmintrin.h:2753
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition xmmintrin.h:218
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
Definition xmmintrin.h:406
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2376
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
Definition xmmintrin.h:303
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand.
Definition xmmintrin.h:1339
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition xmmintrin.h:445
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is less than the second operand.
Definition xmmintrin.h:1124
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
Definition xmmintrin.h:1765
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2091
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2816
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
Definition xmmintrin.h:1814
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
Definition xmmintrin.h:2248
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition xmmintrin.h:2229
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is greater than or equal to the second operand.
Definition xmmintrin.h:1196
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition xmmintrin.h:1384
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are greater than those in the second operand.
Definition xmmintrin.h:671
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2340
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine if the first operand is greater than the second operand.
Definition xmmintrin.h:1315
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2959
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2836
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2070
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not greater than the corresponding value in the second operand.
Definition xmmintrin.h:896
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
Definition xmmintrin.h:1738
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
Definition xmmintrin.h:2889
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:284
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition xmmintrin.h:2112
void _mm_setcsr(unsigned int __i)
Sets the MXCSR register with the 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:462
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:250
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality.
Definition xmmintrin.h:747
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition xmmintrin.h:3008
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
Definition xmmintrin.h:2910
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2871
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition xmmintrin.h:1562
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition xmmintrin.h:1452
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition xmmintrin.h:1516
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2984
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:424
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition xmmintrin.h:2358
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition xmmintrin.h:1876
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is ordered with respect to the corresponding value in the second operand.
Definition xmmintrin.h:1000
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than those in the second operand.
Definition xmmintrin.h:819
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2028
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than those in the second operand.
Definition xmmintrin.h:921
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not greater than or equal to those in the second operand.
Definition xmmintrin.h:973
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are ordered with respect to those in the second operand.
Definition xmmintrin.h:1024
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2012
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than the corresponding value in the second operand.
Definition xmmintrin.h:647
#define __trunc64(x)
Definition xmmintrin.h:51
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine equality.
Definition xmmintrin.h:1243
static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition xmmintrin.h:2498
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is not less than the corresponding value in the second operand.
Definition xmmintrin.h:796
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition xmmintrin.h:2151
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition xmmintrin.h:1998
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition xmmintrin.h:344
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are less than or equal to those in the second operand.
Definition xmmintrin.h:621
unsigned int _mm_getcsr(void)
Returns the contents of the MXCSR register as a 32-bit unsigned integer value.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit ...
Definition xmmintrin.h:139
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value...
Definition xmmintrin.h:387
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the sec...
Definition xmmintrin.h:200
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine inequality.
Definition xmmintrin.h:1362
#define __zext128(x)
Definition xmmintrin.h:53
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is not equal to the second operand.
Definition xmmintrin.h:1220
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition xmmintrin.h:1583
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1910
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition xmmintrin.h:2394
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition xmmintrin.h:869
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is unordered with respect to the corresponding value in the second operand.
Definition xmmintrin.h:1051
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:697
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are unordered with respect to those in the second operand.
Definition xmmintrin.h:1075
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1854
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1792
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition xmmintrin.h:179