10#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
15#ifndef __AVX512VLBF16INTRIN_H
16#define __AVX512VLBF16INTRIN_H
18#define __DEFAULT_FN_ATTRS128 \
19 __attribute__((__always_inline__, __nodebug__, \
20 __target__("avx512vl,avx512bf16"), \
21 __min_vector_width__(128)))
22#define __DEFAULT_FN_ATTRS256 \
23 __attribute__((__always_inline__, __nodebug__, \
24 __target__("avx512vl,avx512bf16"), \
25 __min_vector_width__(256)))
27#if defined(__cplusplus) && (__cplusplus >= 201103L)
28#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
29#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
31#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
32#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
48_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
49 return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
71_mm_mask_cvtne2ps_pbh(__m128bh __W,
__mmask8 __U, __m128 __A, __m128 __B) {
72 return (__m128bh)__builtin_ia32_selectpbf_128((
__mmask8)__U,
73 (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
93_mm_maskz_cvtne2ps_pbh(
__mmask8 __U, __m128 __A, __m128 __B) {
94 return (__m128bh)__builtin_ia32_selectpbf_128((
__mmask8)__U,
95 (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
112_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
113 return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
135_mm256_mask_cvtne2ps_pbh(__m256bh __W,
__mmask16 __U, __m256 __A, __m256 __B) {
136 return (__m256bh)__builtin_ia32_selectpbf_256((
__mmask16)__U,
137 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
157_mm256_maskz_cvtne2ps_pbh(
__mmask16 __U, __m256 __A, __m256 __B) {
158 return (__m256bh)__builtin_ia32_selectpbf_256((
__mmask16)__U,
159 (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
/// Convert a packed [4 x float] vector \a A to a packed [8 x bfloat16]
/// vector (VCVTNEPS2BF16); the upper four result elements are zeroed.
#define _mm_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
192_mm_mask_cvtneps_pbh(__m128bh __W,
__mmask8 __U, __m128 __A) {
193 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
212_mm_maskz_cvtneps_pbh(
__mmask8 __U, __m128 __A) {
213 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
/// Convert a packed [8 x float] vector \a A to a packed [8 x bfloat16]
/// vector (VCVTNEPS2BF16); note the result is 128-bit.
#define _mm256_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
245_mm256_mask_cvtneps_pbh(__m128bh __W,
__mmask8 __U, __m256 __A) {
246 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
264_mm256_maskz_cvtneps_pbh(
__mmask8 __U, __m256 __A) {
265 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
285_mm_dpbf16_ps(__m128
__D, __m128bh __A, __m128bh __B) {
286 return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)
__D,
309_mm_mask_dpbf16_ps(__m128
__D,
__mmask8 __U, __m128bh __A, __m128bh __B) {
310 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
311 (__v4sf)_mm_dpbf16_ps(
__D, __A, __B),
333_mm_maskz_dpbf16_ps(
__mmask8 __U, __m128
__D, __m128bh __A, __m128bh __B) {
334 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
335 (__v4sf)_mm_dpbf16_ps(
__D, __A, __B),
354_mm256_dpbf16_ps(__m256
__D, __m256bh __A, __m256bh __B) {
355 return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)
__D,
378_mm256_mask_dpbf16_ps(__m256
__D,
__mmask8 __U, __m256bh __A, __m256bh __B) {
379 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
380 (__v8sf)_mm256_dpbf16_ps(
__D, __A, __B),
402_mm256_maskz_dpbf16_ps(
__mmask8 __U, __m256
__D, __m256bh __A, __m256bh __B) {
403 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
404 (__v8sf)_mm256_dpbf16_ps(
__D, __A, __B),
419 __v4sf __V = {__A, 0, 0, 0};
420 __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
422 return (__bf16)__R[0];
433_mm_cvtpbh_ps(__m128bh __A) {
435 (__m256) __builtin_convertvector(__A, __v8sf));
446_mm256_cvtpbh_ps(__m128bh __A) {
447 return (__m256) __builtin_convertvector(__A, __v8sf);
461_mm_maskz_cvtpbh_ps(
__mmask8 __U, __m128bh __A) {
462 return (__m128)__builtin_ia32_selectps_128(
477_mm256_maskz_cvtpbh_ps(
__mmask8 __U, __m128bh __A) {
478 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
479 (__v8sf)_mm256_cvtpbh_ps(__A),
497_mm_mask_cvtpbh_ps(__m128 __S,
__mmask8 __U, __m128bh __A) {
498 return (__m128)__builtin_ia32_selectps_128(
499 (
__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
516_mm256_mask_cvtpbh_ps(__m256 __S,
__mmask8 __U, __m128bh __A) {
517 return (__m256)__builtin_ia32_selectps_256(
518 (
__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
521#undef __DEFAULT_FN_ATTRS128
522#undef __DEFAULT_FN_ATTRS256
523#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
524#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zer...
static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a)
Returns the lower 128 bits of a 256-bit floating-point vector of [8 x float] as a 128-bit floating-po...
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
static __inline__ void short __D
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.