11#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
/* Attribute set used by the 128-bit FMA intrinsics declared with this macro:
   always inlined, no debug info, compiled with the "fma" target feature, and
   a minimum vector width of 128 bits. */
18#define __DEFAULT_FN_ATTRS128 \
19 __attribute__((__always_inline__, __nodebug__, __target__("fma"), \
20 __min_vector_width__(128)))
/* Attribute set used by the 256-bit FMA intrinsics declared with this macro:
   always inlined, no debug info, compiled with the "fma" target feature, and
   a minimum vector width of 256 bits. */
21#define __DEFAULT_FN_ATTRS256 \
22 __attribute__((__always_inline__, __nodebug__, __target__("fma"), \
23 __min_vector_width__(256)))
25#if defined(__cplusplus) && (__cplusplus >= 201103L)
26#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
27#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
29#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
30#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
50 return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
71 return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
101 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
130 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
150 return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B,
171 return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B,
201 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
230 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
250 return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
271 return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
301 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
330 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
350 return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B,
371 return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B,
401 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
430 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
457 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
482 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
509 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
534 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
554 return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
575 return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
596 return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B,
617 return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B,
638 return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
659 return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
680 return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B,
701 return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B,
733 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
760 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
791 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
818 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
/* Remove the four helper attribute macros so they do not leak past this
   point. */
821#undef __DEFAULT_FN_ATTRS128
822#undef __DEFAULT_FN_ATTRS256
823#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
824#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
/* Re-define all four macros as empty, so each expands to nothing wherever it
   appears in the declarations that follow.
   NOTE(review): this drops the "fma" target, inlining and constexpr settings
   defined earlier -- confirm that the empty definitions are intentional. */
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a negated multiply-add of 128-bit vectors of [4 x float].
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
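Computes a vector multiply with alternating add/subtract of 256-bit vectors of [8 x float]: even-indexed elements (counting from element 0 in the low bits) compute (__A * __B) + __C and odd-indexed elements compute (__A * __B) - __C.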
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply-add of 128-bit vectors of [4 x float].
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a negated multiply-add of 128-bit vectors of [2 x double].
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
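Computes a multiply with alternating add/subtract of 256-bit vectors of [8 x float]: even-indexed elements (counting from element 0 in the low bits) compute (__A * __B) - __C and odd-indexed elements compute (__A * __B) + __C.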
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
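Computes a multiply with alternating add/subtract of 128-bit vectors of [4 x float]: even-indexed elements (counting from element 0 in the low bits) compute (__A * __B) + __C and odd-indexed elements compute (__A * __B) - __C.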
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
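Computes a multiply with alternating add/subtract of 128-bit vectors of [4 x float]: even-indexed elements (counting from element 0 in the low bits) compute (__A * __B) - __C and odd-indexed elements compute (__A * __B) + __C.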
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
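Computes a multiply with alternating add/subtract of 256-bit vectors of [4 x double]: even-indexed elements (counting from element 0 in the low bits) compute (__A * __B) - __C and odd-indexed elements compute (__A * __B) + __C.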
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply-subtract of 256-bit vectors of [4 x double].
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply-subtract of 256-bit vectors of [8 x float].
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
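Computes a multiply with alternating add/subtract of 128-bit vectors of [2 x double]: the low element (element 0) computes (__A * __B) + __C and the high element (element 1) computes (__A * __B) - __C.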
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar multiply-subtract of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar multiply-subtract of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar negated multiply-subtract of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar multiply-add of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
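Computes a multiply with alternating add/subtract of 128-bit vectors of [2 x double]: the low element (element 0) computes (__A * __B) - __C and the high element (element 1) computes (__A * __B) + __C.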
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a multiply-subtract of 128-bit vectors of [4 x float].
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a negated multiply-add of 256-bit vectors of [4 x double].
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a multiply-add of 256-bit vectors of [8 x float].
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
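Computes a vector multiply with alternating add/subtract of 256-bit vectors of [4 x double]: even-indexed elements (counting from element 0 in the low bits) compute (__A * __B) + __C and odd-indexed elements compute (__A * __B) - __C.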
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar negated multiply-add of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply-add of 128-bit vectors of [2 x double].
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
Computes a multiply-add of 256-bit vectors of [4 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
Computes a scalar negated multiply-subtract of the double-precision values in the low 64 bits of 128-bit vectors of [2 x double].
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
Computes a negated multiply-add of 256-bit vectors of [8 x float].
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a multiply-subtract of 128-bit vectors of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar negated multiply-add of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
Computes a scalar multiply-add of the single-precision values in the low 32 bits of 128-bit vectors of [4 x float].