Macros
#define	__DEFAULT_FN_ATTRS
#define	__DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS

Functions
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR	_mm512_bmacor16x16x16 (__m512i __A, __m512i __B, __m512i __C)
	Multiplies two 16x16 bit matrices using OR reduction and ORs the product into a third 16x16 bit matrix (which is also the destination).
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR	_mm512_bmacxor16x16x16 (__m512i __A, __m512i __B, __m512i __C)
	Multiplies two 16x16 bit matrices using XOR reduction and XORs the product into a third 16x16 bit matrix (which is also the destination).
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR	_mm512_bitrev_epi8 (__m512i __A)
	Reverses the bits within each byte of the source vector.
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR	_mm512_mask_bitrev_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
	Reverses the bits within each byte of the source vector, using a writemask to conditionally select elements.
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR	_mm512_maskz_bitrev_epi8 (__mmask64 __U, __m512i __A)
	Reverses the bits within each byte of the source vector, zeroing elements based on the writemask.

Macro Definition Documentation

◆ __DEFAULT_FN_ATTRS

#define __DEFAULT_FN_ATTRS

Value:

__attribute__((__always_inline__, __nodebug__, __target__("avx512bmm"), \
               __min_vector_width__(512)))

Definition at line 19 of file avx512bmmintrin.h.

◆ __DEFAULT_FN_ATTRS_CONSTEXPR

#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS

Definition at line 26 of file avx512bmmintrin.h.

Referenced by _cvtmask16_u32(), _cvtmask32_u32(), _cvtmask8_u32(), _knot_mask32(), _mm256_add_ps(), _mm256_div_ps(), _mm256_hadd_ps(), _mm256_hsub_ps(), _mm256_max_ps(), _mm256_min_ps(), _mm256_mul_ps(), _mm256_setzero_pd(), _mm256_setzero_ps(), _mm256_sub_ps(), _mm256_testc_pd(), _mm256_testc_ps(), _mm256_testnzc_ps(), _mm256_testz_pd(), _mm256_testz_ps(), _mm512_kand(), _mm512_kmov(), _mm_abs_pi16(), _mm_abs_pi32(), _mm_abs_pi8(), _mm_add_pd(), _mm_add_sd(), _mm_add_si64(), _mm_and_pd(), _mm_cvt_si2ss(), _mm_cvtsi32_ss(), _mm_div_pd(), _mm_div_sd(), _mm_hadd_pi16(), _mm_hadd_pi32(), _mm_hadd_ps(), _mm_hadds_pi16(), _mm_hsub_pi16(), _mm_hsub_pi32(), _mm_hsub_ps(), _mm_hsubs_pi16(), _mm_max_pd(), _mm_max_ps(), _mm_max_sd(), _mm_max_ss(), _mm_min_pd(), _mm_min_ps(), _mm_min_sd(), _mm_min_ss(), _mm_movemask_ps(), _mm_mul_pd(), _mm_mul_sd(), _mm_mul_su32(), _mm_or_pd(), _mm_set1_epi32(), _mm_set1_epi8(), _mm_set1_pd(), _mm_set_epi32(), _mm_set_pd(), _mm_set_pd1(), _mm_set_sd(), _mm_setr_pd(), _mm_setzero_pd(), _mm_setzero_si128(), _mm_sign_pi16(), _mm_sign_pi32(), _mm_sign_pi8(), _mm_sub_pd(), _mm_sub_sd(), _mm_sub_si64(), and _mm_xor_pd().

Function Documentation

◆ _mm512_bitrev_epi8()

__inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_bitrev_epi8 ( __m512i __A )

static

Reverses the bits within each byte of the source vector.

For each byte in the source, reverses the order of its 8 bits to generate the corresponding destination byte. For example, 0b10110001 becomes 0b10001101.

This intrinsic corresponds to the VBITREV instruction.

Parameters

__A	A 512-bit vector of [64 x i8] where each byte will have its bits reversed.

Returns: A 512-bit vector of [64 x i8] with bit-reversed bytes.

Definition at line 118 of file avx512bmmintrin.h.

Referenced by _mm512_mask_bitrev_epi8(), and _mm512_maskz_bitrev_epi8().

◆ _mm512_bmacor16x16x16()

__inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_bmacor16x16x16	(	__m512i	__A,
		__m512i	__B,
		__m512i	__C )

static

Multiplies two 16x16 bit matrices using OR reduction and ORs the product into a third 16x16 bit matrix (which is also the destination).

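For the 512-bit ZMM form, each register contains two 16x16 (256-bit) bit matrices, one in bits [255:0] and one in bits [511:256]. The operation is performed independently on each 256-bit lane (bit indices below are relative to the lane):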

for i in 0 to 15
  for j in 0 to 15
    reduction_bit = __C[16*i+j]
    for k in 0 to 15
      reduction_bit |= __A[16*i+k] & __B[16*k+j]
    end for k
    dest[16*i+j] = reduction_bit
  end for j
end for i

This intrinsic corresponds to the VBMACOR16X16X16 instruction.

Parameters

__A	A 512-bit vector containing two 16x16 bit matrices (one per 256-bit lane).
__B	A 512-bit vector containing two 16x16 bit matrices (one per 256-bit lane).
__C	A 512-bit accumulator vector containing the initial values to OR with.

Returns: A 512-bit vector containing the accumulated result for each lane.

Note: This instruction does not support masking.

Definition at line 61 of file avx512bmmintrin.h.

◆ _mm512_bmacxor16x16x16()

__inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_bmacxor16x16x16	(	__m512i	__A,
		__m512i	__B,
		__m512i	__C )

static

Multiplies two 16x16 bit matrices using XOR reduction and XORs the product into a third 16x16 bit matrix (which is also the destination).

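For the 512-bit ZMM form, each register contains two 16x16 (256-bit) bit matrices, one in bits [255:0] and one in bits [511:256]. The operation is performed independently on each 256-bit lane (bit indices below are relative to the lane):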

for i in 0 to 15
  for j in 0 to 15
    reduction_bit = __C[16*i+j]
    for k in 0 to 15
      reduction_bit ^= __A[16*i+k] & __B[16*k+j]
    end for k
    dest[16*i+j] = reduction_bit
  end for j
end for i

This intrinsic corresponds to the VBMACXOR16X16X16 instruction.

Parameters

__A	A 512-bit vector containing two 16x16 bit matrices (one per 256-bit lane).
__B	A 512-bit vector containing two 16x16 bit matrices (one per 256-bit lane).
__C	A 512-bit accumulator vector containing the initial values to XOR with.

Returns: A 512-bit vector containing the accumulated result for each lane.

Note: This instruction does not support masking.

Definition at line 98 of file avx512bmmintrin.h.

◆ _mm512_mask_bitrev_epi8()

__inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask_bitrev_epi8	(	__mmask64	__U,
		__m512i	__A,
		__m512i	__B )

static

Reverses the bits within each byte of the source vector, using a writemask to conditionally select elements.

For each byte position, if the corresponding mask bit in __U is 1, the byte from __A has its bits reversed and is stored in the result. If the mask bit is 0, the corresponding byte from __B is copied to the result unchanged (merge masking).

This intrinsic corresponds to the VBITREV instruction.

Parameters

__U	A 64-bit mask in which each bit controls the corresponding byte (8-bit element). A 1 performs bit reversal; a 0 selects the passthrough byte from __B.
__A	A 512-bit vector of [64 x i8] to be bit-reversed.
__B	A 512-bit vector of [64 x i8] providing passthrough values.

Returns: A 512-bit vector combining bit-reversed and passthrough bytes.

Definition at line 142 of file avx512bmmintrin.h.

References _mm512_bitrev_epi8().

◆ _mm512_maskz_bitrev_epi8()

__inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_maskz_bitrev_epi8	(	__mmask64	__U,
		__m512i	__A )

static

Reverses the bits within each byte of the source vector, zeroing elements based on the writemask.

For each byte position, if the corresponding mask bit in __U is 1, the byte from __A has its bits reversed and is stored in the result. If the mask bit is 0, the result byte is set to zero (zero masking).

This intrinsic corresponds to the VBITREV instruction.

Parameters

__U	A 64-bit mask in which each bit controls the corresponding byte (8-bit element). A 1 performs bit reversal; a 0 sets the result byte to zero.
__A	A 512-bit vector of [64 x i8] to be bit-reversed.

Returns: A 512-bit vector with bit-reversed or zeroed bytes.

Definition at line 165 of file avx512bmmintrin.h.

References _mm512_bitrev_epi8(), and _mm512_setzero_si512().

Macros

Functions

Macro Definition Documentation

◆ __DEFAULT_FN_ATTRS

◆ __DEFAULT_FN_ATTRS_CONSTEXPR

Function Documentation

◆ _mm512_bitrev_epi8()

◆ _mm512_bmacor16x16x16()

◆ _mm512_bmacxor16x16x16()

◆ _mm512_mask_bitrev_epi8()

◆ _mm512_maskz_bitrev_epi8()