11 "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
16#ifndef __AVX10_2_512BF16INTRIN_H
17#define __AVX10_2_512BF16INTRIN_H
20typedef __bf16 __m512bh_u
__attribute__((__vector_size__(64), __aligned__(1)));
23#define __DEFAULT_FN_ATTRS512 \
24 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
25 __min_vector_width__(512)))
27#if defined(__cplusplus) && (__cplusplus >= 201103L)
28#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
30#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_pbh(void) {
  return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
  return (__m512bh)__builtin_ia32_undef512();
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set1_pbh(__bf16 bf) {
  return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set_pbh(
    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
    __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
    __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
    __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
  return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
                             bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
                             bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
                             bf8,  bf7,  bf6,  bf5,  bf4,  bf3,  bf2,  bf1};
}

#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10,     \
                        bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19,  \
                        bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28,  \
                        bf29, bf30, bf31, bf32)                                \
  _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26),       \
                 (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19),       \
                 (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12),       \
                 (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4),     \
                 (bf3), (bf2), (bf1))

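/* Usage sketch (added commentary, not part of the original header; assumes an
 * AVX10.2 target and that <immintrin.h> is included). _mm512_set_pbh() takes
 * element 31 first and element 0 last; _mm512_setr_pbh() simply reverses the
 * argument order, so its first argument lands in element 0, i.e. memory order:
 *
 *   __m512bh ones = _mm512_set1_pbh((__bf16)1.0f); // broadcast to all 32 lanes
 *   __bf16 buf[32];
 *   _mm512_storeu_pbh(buf, ones);                  // buf[0..31] == 1.0
 */
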
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castbf16_ps(__m512bh __a) {
  return (__m512)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castbf16_pd(__m512bh __a) {
  return (__m512d)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castbf16_si512(__m512bh __a) {
  return (__m512i)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castps_pbh(__m512 __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castpd_pbh(__m512d __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castsi512_pbh(__m512i __a) {
  return (__m512bh)__a;
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castbf16512_pbh128(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castbf16512_pbh256(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                 11, 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(
      __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

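/* Note (added commentary): the cast128/cast256-to-512 helpers above leave the
 * upper elements undefined (the -1 shuffle indices), while the zext variants
 * fill them with zeros. Illustrative sketch, assuming the 128-bit
 * _mm_set1_pbh() from <avx10_2bf16intrin.h>:
 *
 *   __m128bh lo = _mm_set1_pbh((__bf16)3.0f);
 *   __m512bh z  = _mm512_zextbf16128_pbh512(lo); // elements 8..31 are 0.0
 *   __m512bh c  = _mm512_castbf16128_pbh512(lo); // elements 8..31 undefined
 */
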
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_load_pbh(void const *__p) {
  return *(const __m512bh *)__p;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_loadu_pbh(void const *__p) {
  struct __loadu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pbh *)__p)->__v;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
                                                              __m512bh __A) {
  *(__m512bh *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
                                                               __m512bh __A) {
  struct __storeu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pbh *)__P)->__v = __A;
}

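/* Usage sketch (added commentary): _mm512_load_pbh()/_mm512_store_pbh() expect
 * a 64-byte-aligned address, while the *_loadu_*/*_storeu_* forms go through
 * the unaligned __m512bh_u type and accept any alignment:
 *
 *   __bf16 src[32];                      // arbitrarily aligned
 *   __m512bh v = _mm512_loadu_pbh(src);  // unaligned load
 *   _mm512_storeu_pbh(src, v);           // unaligned store
 */
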
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
                                                (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_add_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A + (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_add_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_add_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

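/* Usage sketch (added commentary): the mask_/maskz_ arithmetic forms follow
 * the usual AVX-512 convention; for lanes whose mask bit is 0, mask_ keeps
 * the corresponding element of the pass-through operand and maskz_ writes
 * bf16 zero:
 *
 *   __m512bh a = _mm512_set1_pbh((__bf16)2.0f);
 *   __m512bh b = _mm512_set1_pbh((__bf16)3.0f);
 *   __mmask32 m = 0x0000FFFF;                       // low 16 lanes only
 *   __m512bh r0 = _mm512_mask_add_pbh(a, m, a, b);  // high lanes keep 'a'
 *   __m512bh r1 = _mm512_maskz_add_pbh(m, a, b);    // high lanes are 0.0
 */
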
static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_sub_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A - (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sub_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mul_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A * (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_mul_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_div_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)((__v32bf)__A / (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_div_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_div_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_max_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vmaxbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_min_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vminbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

#define _mm512_cmp_pbh_mask(__A, __B, __P)                                     \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A),        \
                                              (__v32bf)(__m512bh)(__B),        \
                                              (int)(__P), (__mmask32)-1))

#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P)                           \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A),        \
                                              (__v32bf)(__m512bh)(__B),        \
                                              (int)(__P), (__mmask32)(__U)))

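/* Usage sketch (added commentary): the predicate __P uses the same _CMP_*
 * immediates as the other AVX-512 vcmp intrinsics, and the result carries one
 * mask bit per bf16 element:
 *
 *   __m512bh x = _mm512_set1_pbh((__bf16)1.0f);
 *   __m512bh y = _mm512_set1_pbh((__bf16)2.0f);
 *   __mmask32 lt = _mm512_cmp_pbh_mask(x, y, _CMP_LT_OQ); // all 32 bits set
 */
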
#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm)                            \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U)))

#define _mm512_fpclass_pbh_mask(__A, imm)                                      \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask(                             \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)-1))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_scalef_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(),
      (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(),
      (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                   (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_getexp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_rsqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                     (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

#define _mm512_reduce_pbh(__A, imm)                                            \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(),   \
      (__mmask32)-1))

#define _mm512_mask_reduce_pbh(__W, __U, __A, imm)                             \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W),          \
      (__mmask32)(__U)))

#define _mm512_maskz_reduce_pbh(__U, __A, imm)                                 \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
      (__mmask32)(__U)))

#define _mm512_roundscale_pbh(__A, imm)                                        \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
      (__mmask32)-1))

#define _mm512_mask_roundscale_pbh(__W, __U, __A, imm)                         \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W),          \
      (__mmask32)(__U)))

#define _mm512_maskz_roundscale_pbh(__U, __A, imm)                             \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(),     \
      (__mmask32)(__U)))

#define _mm512_getmant_pbh(__A, __B, __C)                                      \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1))

#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C)                       \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))

#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C)                           \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))

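/* Note (added commentary): as with the other AVX-512 getmant intrinsics, __B
 * is expected to be a normalization interval (_MM_MANT_NORM_*) and __C a sign
 * control (_MM_MANT_SIGN_*); the two are packed into a single immediate as
 * ((__C) << 2) | (__B). Illustrative sketch for an existing __m512bh value v:
 *
 *   __m512bh m = _mm512_getmant_pbh(v, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 */
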
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_sqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
                                                (__v32bf)_mm512_sqrt_pbh(__A),
                                                (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B,
                                             (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmadd_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B,
                                             -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fmsub_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B,
                                             (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B,
                                             -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

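/* Note (added commentary): the fused multiply families above compute A*B+C
 * (fmadd), A*B-C (fmsub), -(A*B)+C (fnmadd) and -(A*B)-C (fnmsub). For
 * masked-off lanes, mask_ keeps the element of the first source __A, mask3_
 * keeps the element of the addend __C, and maskz_ writes zero. Illustrative
 * sketch for existing __m512bh values a, b, acc and a mask m:
 *
 *   acc = _mm512_mask3_fmadd_pbh(a, b, acc, m); // acc += a*b where m is set
 */
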
#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
#undef __DEFAULT_FN_ATTRS512

#endif // __AVX10_2_512BF16INTRIN_H
#endif // __SSE2__