clang 23.0.0git
avx512bmmvlintrin.h
Go to the documentation of this file.
1/*===------------- avx512bmmvlintrin.h - BMM intrinsics ------------------===
2 *
3 *
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *
8 *===-----------------------------------------------------------------------===
9 */
10#ifndef __IMMINTRIN_H
11#error \
12 "Never use <avx512bmmvlintrin.h> directly; include <immintrin.h> instead."
13#endif
14
15#ifndef __BMMVLINTRIN_H
16#define __BMMVLINTRIN_H
17
/* Attribute sets applied to every intrinsic defined in this header. Both
 * request __always_inline__ and __nodebug__ and require the "avx512bmm" and
 * "avx512vl" target features; they differ only in the __min_vector_width__
 * they declare (128 vs. 256 bits). */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bmm,avx512vl"),                             \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bmm,avx512vl"),                             \
                 __min_vector_width__(256)))

/* The *_CONSTEXPR flavours append `constexpr` only when compiling as C++11 or
 * newer; in C, and in older C++ dialects, they are plain aliases of the
 * attribute sets above. */
#if !defined(__cplusplus) || (__cplusplus < 201103L)
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#else
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#endif
33
34/// Multiplies two 16x16 bit matrices using OR reduction and ORs the product
35/// into a third 16x16 bit matrix (which is also the destination).
36///
37/// For the 256-bit YMM form, the source registers/memory each contain a single
38/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs:
39/// \code{.operation}
40/// for i in 0 to 15
41/// for j in 0 to 15
42/// reduction_bit = __C[16*i+j]
43/// for k in 0 to 15
44/// reduction_bit |= __A[16*i+k] & __B[16*k+j]
45/// end for k
46/// dest[16*i+j] = reduction_bit
47/// end for j
48/// end for i
49/// \endcode
50///
51/// \headerfile <immintrin.h>
52///
53/// This intrinsic corresponds to the <c> VBMACOR16X16X16 </c> instruction.
54///
55/// \param __A
56/// A 256-bit vector containing a 16x16 bit matrix.
57/// \param __B
58/// A 256-bit vector containing a 16x16 bit matrix.
59/// \param __C
60/// A 256-bit accumulator vector containing the initial values to OR with.
61/// \returns A 256-bit vector containing the accumulated result.
62/// \note This instruction does not support masking.
63static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
64_mm256_bmacor16x16x16(__m256i __A, __m256i __B, __m256i __C) {
65 return (__m256i)__builtin_ia32_bmacor16x16x16_v16hi(
66 (__v16hi)__A, (__v16hi)__B, (__v16hi)__C);
67}
68
69/// Multiplies two 16x16 bit matrices using XOR reduction and XORs the product
70/// into a third 16x16 bit matrix (which is also the destination).
71///
72/// For the 256-bit YMM form, the source registers/memory each contain a single
73/// 16x16 (256-bit) matrix in bits [255:0]. The operation performs:
74/// \code{.operation}
75/// for i in 0 to 15
76/// for j in 0 to 15
77/// reduction_bit = __C[16*i+j]
78/// for k in 0 to 15
79/// reduction_bit ^= __A[16*i+k] & __B[16*k+j]
80/// end for k
81/// dest[16*i+j] = reduction_bit
82/// end for j
83/// end for i
84/// \endcode
85///
86/// \headerfile <immintrin.h>
87///
88/// This intrinsic corresponds to the <c> VBMACXOR16X16X16 </c> instruction.
89///
90/// \param __A
91/// A 256-bit vector containing a 16x16 bit matrix.
92/// \param __B
93/// A 256-bit vector containing a 16x16 bit matrix.
94/// \param __C
95/// A 256-bit accumulator vector containing the initial values to XOR with.
96/// \returns A 256-bit vector containing the accumulated result.
97/// \note This instruction does not support masking.
98static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
99_mm256_bmacxor16x16x16(__m256i __A, __m256i __B, __m256i __C) {
100 return (__m256i)__builtin_ia32_bmacxor16x16x16_v16hi(
101 (__v16hi)__A, (__v16hi)__B, (__v16hi)__C);
102}
103
104/// Reverses the bits within each byte of the source vector.
105///
106/// For each byte in the source, reverses the order of its 8 bits to generate
107/// the corresponding destination byte. For example, 0b10110001 becomes
108/// 0b10001101.
109///
110/// \headerfile <immintrin.h>
111///
112/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
113///
114/// \param __A
115/// A 128-bit vector of [16 x i8] where each byte will have its bits
116/// reversed.
117/// \returns A 128-bit vector of [16 x i8] with bit-reversed bytes.
118static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
119_mm128_bitrev_epi8(__m128i __A) {
120 return (__m128i)__builtin_elementwise_bitreverse((__v16qi)__A);
121}
122
123/// Reverses the bits within each byte of the source vector.
124///
125/// For each byte in the source, reverses the order of its 8 bits to generate
126/// the corresponding destination byte. For example, 0b10110001 becomes
127/// 0b10001101.
128///
129/// \headerfile <immintrin.h>
130///
131/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
132///
133/// \param __A
134/// A 256-bit vector of [32 x i8] where each byte will have its bits
135/// reversed.
136/// \returns A 256-bit vector of [32 x i8] with bit-reversed bytes.
137static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
138_mm256_bitrev_epi8(__m256i __A) {
139 return (__m256i)__builtin_elementwise_bitreverse((__v32qi)__A);
140}
141
142/// Reverses the bits within each byte of the source vector, using a writemask
143/// to conditionally select elements.
144///
145/// For each byte position, if the corresponding mask bit is 1, the byte from
146/// \a A has its bits reversed and stored in the result. If the mask bit is 0,
147/// the corresponding byte from \a B is copied to the result (merge masking).
148///
149/// \headerfile <immintrin.h>
150///
151/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
152///
153/// \param __U
154/// A 16-bit mask value where each bit controls one byte (per 8-bit element).
155/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B.
156/// \param __A
157/// A 128-bit vector of [16 x i8] to be bit-reversed.
158/// \param __B
159/// A 128-bit vector of [16 x i8] providing passthrough values.
160/// \returns A 128-bit vector combining bit-reversed and passthrough bytes.
161static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
162_mm128_mask_bitrev_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
163 return (__m128i)__builtin_ia32_selectb_128(
164 (__mmask16)__U, (__v16qi)_mm128_bitrev_epi8(__A), (__v16qi)__B);
165}
166
167/// Reverses the bits within each byte of the source vector, using a writemask
168/// to conditionally select elements.
169///
170/// For each byte position, if the corresponding mask bit is 1, the byte from
171/// \a A has its bits reversed and stored in the result. If the mask bit is 0,
172/// the corresponding byte from \a B is copied to the result (merge masking).
173///
174/// \headerfile <immintrin.h>
175///
176/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
177///
178/// \param __U
179/// A 32-bit mask value where each bit controls one byte (per 8-bit element).
180/// A 1 performs bit reversal; a 0 selects the passthrough byte from __B.
181/// \param __A
182/// A 256-bit vector of [32 x i8] to be bit-reversed.
183/// \param __B
184/// A 256-bit vector of [32 x i8] providing passthrough values.
185/// \returns A 256-bit vector combining bit-reversed and passthrough bytes.
186static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
187_mm256_mask_bitrev_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
188 return (__m256i)__builtin_ia32_selectb_256(
189 (__mmask32)__U, (__v32qi)_mm256_bitrev_epi8(__A), (__v32qi)__B);
190}
191
192/// Reverses the bits within each byte of the source vector, zeroing elements
193/// based on the writemask.
194///
195/// For each byte position, if the corresponding mask bit is 1, the byte from
196/// \a A has its bits reversed and stored in the result. If the mask bit is 0,
197/// the result byte is set to zero (zero masking).
198///
199/// \headerfile <immintrin.h>
200///
201/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
202///
203/// \param __U
204/// A 16-bit mask value where each bit controls one byte (per 8-bit element).
205/// A 1 performs bit reversal; a 0 sets the byte to zero.
206/// \param __A
207/// A 128-bit vector of [16 x i8] to be bit-reversed.
208/// \returns A 128-bit vector with bit-reversed or zeroed bytes.
209static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
211 return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
212 (__v16qi)_mm128_bitrev_epi8(__A),
213 (__v16qi)_mm_setzero_si128());
214}
215
216/// Reverses the bits within each byte of the source vector, zeroing elements
217/// based on the writemask.
218///
219/// For each byte position, if the corresponding mask bit is 1, the byte from
220/// \a A has its bits reversed and stored in the result. If the mask bit is 0,
221/// the result byte is set to zero (zero masking).
222///
223/// \headerfile <immintrin.h>
224///
225/// This intrinsic corresponds to the <c> VBITREV </c> instruction.
226///
227/// \param __U
228/// A 32-bit mask value where each bit controls one byte (per 8-bit element).
229/// A 1 performs bit reversal; a 0 sets the byte to zero.
230/// \param __A
231/// A 256-bit vector of [32 x i8] to be bit-reversed.
232/// \returns A 256-bit vector with bit-reversed or zeroed bytes.
233static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
235 return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
236 (__v32qi)_mm256_bitrev_epi8(__A),
237 (__v32qi)_mm256_setzero_si256());
238}
239
/* Remove the file-local attribute macros so they are no longer defined for
 * code that follows this header. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
244
245#endif
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm128_maskz_bitrev_epi8(__mmask16 __U, __m128i __A)
Reverses the bits within each byte of the source vector, zeroing elements based on the writemask.
static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm128_mask_bitrev_epi8(__mmask16 __U, __m128i __A, __m128i __B)
Reverses the bits within each byte of the source vector, using a writemask to conditionally select el...
static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_bitrev_epi8(__m256i __A)
Reverses the bits within each byte of the source vector.
static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_bitrev_epi8(__mmask32 __U, __m256i __A)
Reverses the bits within each byte of the source vector, zeroing elements based on the writemask.
static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_bmacxor16x16x16(__m256i __A, __m256i __B, __m256i __C)
Multiplies two 16x16 bit matrices using XOR reduction and XORs the product into a third 16x16 bit mat...
static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_bitrev_epi8(__mmask32 __U, __m256i __A, __m256i __B)
Reverses the bits within each byte of the source vector, using a writemask to conditionally select el...
static __inline __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm128_bitrev_epi8(__m128i __A)
Reverses the bits within each byte of the source vector.
static __inline __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_bmacor16x16x16(__m256i __A, __m256i __B, __m256i __C)
Multiplies two 16x16 bit matrices using OR reduction and ORs the product into a third 16x16 bit matri...
unsigned int __mmask32
unsigned short __mmask16
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4299
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878