clang 23.0.0git
avx512bmmintrin.h
Go to the documentation of this file.
1/*===-------- avx512bmmintrin.h - AVX512BMM intrinsics *------------------===
2 *
3 *
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *
8 *===---------------------------------------------------------------------===
9 */
10
11#ifndef __IMMINTRIN_H
12#error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead."
13#endif
14
15#ifndef _AVX512BMMINTRIN_H
16#define _AVX512BMMINTRIN_H
17
/* Attribute set applied to every intrinsic defined in this header:
 * always_inline, nodebug, target("avx512bmm") and min_vector_width(512). */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512bmm"),      \
                 __min_vector_width__(512)))

/* Same attribute set, with `constexpr` appended when compiling as C++11 or
 * newer; in C and in older C++ it is just __DEFAULT_FN_ATTRS. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#endif
28
29/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product
30/// into a third 16x16 bit matrix (which is also the destination).
31///
32/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit)
33/// matrices in bits [255:0] and [511:256]. The operation performs:
34/// \code{.operation}
35/// for i in 0 to 15
36/// for j in 0 to 15
37/// reduction_bit = __C[16*i+j]
38/// for k in 0 to 15
39/// reduction_bit |= __A[16*i+k] & __B[16*k+j]
40/// end for k
41/// dest[16*i+j] = reduction_bit
42/// end for j
43/// end for i
44/// \endcode
45///
46/// \headerfile <immintrin.h>
47///
48/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction.
49///
50/// \param __A
51/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
52/// lane).
53/// \param __B
54/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
55/// lane).
56/// \param __C
57/// A 512-bit accumulator vector containing the initial values to OR with.
58/// \returns A 512-bit vector containing the accumulated result for each lane.
59/// \note This instruction does not support masking.
60static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
61_mm512_bmacor16x16x16(__m512i __A, __m512i __B, __m512i __C) {
62 return (__m512i)__builtin_ia32_bmacor16x16x16_v32hi(
63 (__v32hi)__A, (__v32hi)__B, (__v32hi)__C);
64}
65
66/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product
67/// into a third 16x16 bit matrix (which is also the destination).
68///
69/// For the 512-bit ZMM form, each register contains two 16x16 (256-bit)
70/// matrices in bits [255:0] and [511:256]. The operation performs:
71/// \code{.operation}
72/// for i in 0 to 15
73/// for j in 0 to 15
74/// reduction_bit = __C[16*i+j]
75/// for k in 0 to 15
76/// reduction_bit ^= __A[16*i+k] & __B[16*k+j]
77/// end for k
78/// dest[16*i+j] = reduction_bit
79/// end for j
80/// end for i
81/// \endcode
82///
83/// \headerfile <immintrin.h>
84///
85/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction.
86///
87/// \param __A
88/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
89/// lane).
90/// \param __B
91/// A 512-bit vector containing two 16x16 bit matrices (one per 256-bit
92/// lane).
93/// \param __C
94/// A 512-bit accumulator vector containing the initial values to XOR with.
95/// \returns A 512-bit vector containing the accumulated result for each lane.
96/// \note This instruction does not support masking.
97static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
98_mm512_bmacxor16x16x16(__m512i __A, __m512i __B, __m512i __C) {
99 return (__m512i)__builtin_ia32_bmacxor16x16x16_v32hi(
100 (__v32hi)__A, (__v32hi)__B, (__v32hi)__C);
101}
102
103/// Reverses the bits within each byte of the source vector.
104///
105/// For each byte in the source, reverses the order of its 8 bits to generate
106/// the corresponding destination byte. For example, 0b10110001 becomes
107/// 0b10001101.
108///
109/// \headerfile <immintrin.h>
110///
111/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
112///
113/// \param __A
114/// A 512-bit vector of [64 x i8] where each byte will have its bits
115/// reversed.
116/// \returns A 512-bit vector of [64 x i8] with bit-reversed bytes.
117static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
118_mm512_bitrev_epi8(__m512i __A) {
119 return (__m512i)__builtin_elementwise_bitreverse((__v64qi)__A);
120}
121
122/// Reverses the bits within each byte of the source vector, using a writemask
123/// to conditionally select elements.
124///
125/// For each byte position, if the corresponding mask bit is 1, the byte from
126/// \a A has its bits reversed and stored in the result. If the mask bit is 0,
127/// the corresponding byte from \a B is copied to the result (merge masking).
128///
129/// \headerfile <immintrin.h>
130///
131/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
132///
133/// \param __U
134/// A 64-bit mask value where each bit controls one byte (per 8-bit element).
135/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B.
136/// \param __A
137/// A 512-bit vector of [64 x i8] to be bit-reversed.
138/// \param __B
139/// A 512-bit vector of [64 x i8] providing passthrough values.
140/// \returns A 512-bit vector combining bit-reversed and passthrough bytes.
141static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
142_mm512_mask_bitrev_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
143 return (__m512i)__builtin_ia32_selectb_512(
144 (__mmask64)__U, (__v64qi)_mm512_bitrev_epi8(__A), (__v64qi)__B);
145}
146
147/// Reverses the bits within each byte of the source vector, zeroing elements
148/// based on the writemask.
149///
150/// For each byte position, if the corresponding mask bit is 1, the byte from
151/// \a A has its bits reversed and stored in the result. If the mask bit is 0,
152/// the result byte is set to zero (zero masking).
153///
154/// \headerfile <immintrin.h>
155///
156/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
157///
158/// \param __U
159/// A 64-bit mask value where each bit controls one byte (per 8-bit element).
160/// A 1 performs bit reversal; a 0 sets the byte to zero.
161/// \param __A
162/// A 512-bit vector of [64 x i8] to be bit-reversed.
163/// \returns A 512-bit vector with bit-reversed or zeroed bytes.
164static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
166 return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
167 (__v64qi)_mm512_bitrev_epi8(__A),
168 (__v64qi)_mm512_setzero_si512());
169}
170
/* The attribute helper macros are only needed by the definitions above;
 * remove them so they are not visible past this header. */
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS
173
174#endif
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_bmacxor16x16x16(__m512i __A, __m512i __B, __m512i __C)
Multiplies two 16x16 bit matrices using XOR reduction and XORs the product into a third 16x16 bit mat...
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_bmacor16x16x16(__m512i __A, __m512i __B, __m512i __C)
Multiplies two 16x16 bit matrices using OR reduction and ORs the product into a third 16x16 bit matri...
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_bitrev_epi8(__m512i __A)
Reverses the bits within each byte of the source vector.
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask_bitrev_epi8(__mmask64 __U, __m512i __A, __m512i __B)
Reverses the bits within each byte of the source vector, using a writemask to conditionally select el...
static __inline __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_maskz_bitrev_epi8(__mmask64 __U, __m512i __A)
Reverses the bits within each byte of the source vector, zeroing elements based on the writemask.
unsigned long long __mmask64
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_si512(void)