#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(128)))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif
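
/* Everything below is gated on the AVX512-FP16 feature via the
 * __target__("avx512fp16") attribute above, so these intrinsics are only
 * callable from code compiled with that feature enabled, e.g.
 *
 *   clang -mavx512fp16 file.c
 *
 * or from a function carrying __attribute__((target("avx512fp16"))).
 */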
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};

_mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};

  return (__m256h)__builtin_ia32_undef256();

_mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};

  return (__m128h)__builtin_ia32_undef128();

  return (__m512h)__builtin_ia32_undef512();

  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};

  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};

  return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
                       e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
                       e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
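
/* Element-order sketch: _mm512_set_ph() lists its arguments from the highest
 * lane down to lane 0 (note the reversed initializer above), while
 * _mm512_setr_ph() takes them in lane order, so
 *
 *   __m512h v = _mm512_setr_ph(e0, e1, ... , e31);   // v[0] == e0
 *
 * places e0 in the lowest half-precision lane.
 */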
_mm512_set1_pch(_Float16 _Complex __h) {

_mm256_castph_si256(__m256h __a) {

_mm512_castph_si512(__m512h __a) {

_mm256_castsi256_ph(__m256i __a) {

_mm512_castsi512_ph(__m512i __a) {
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);

_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);

_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);

_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                              15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);

_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                                 27, 28, 29, 30, 31);
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
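
/* Note: the _mm512_castph128_ph512()-style widening casts above leave the
 * upper lanes of the result undefined (hence the
 * __builtin_nondeterministic_value() operands), whereas the _mm*_zextph*_ph*
 * functions in this block zero-extend, filling the upper lanes from
 * _mm_setzero_ph() / _mm256_setzero_ph().
 */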
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);

_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
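
/* Usage sketch for the scalar ordered-compare helpers: the predicate is one
 * of the _CMP_* constants and the result is an int that is 1 when the
 * relation holds for lane 0 of each operand, e.g. with __m128h a, b:
 *
 *   int lt = _mm_comi_sh(a, b, _CMP_LT_OS); // a[0] < b[0], ordered, signaling
 */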
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
  return (__m512h)((__v32hf)__A + (__v32hf)__B);

_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);

_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
391#define _mm512_add_round_ph(A, B, R) \
392 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
393 (__v32hf)(__m512h)(B), (int)(R)))
395#define _mm512_mask_add_round_ph(W, U, A, B, R) \
396 ((__m512h)__builtin_ia32_selectph_512( \
397 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
398 (__v32hf)(__m512h)(W)))
400#define _mm512_maskz_add_round_ph(U, A, B, R) \
401 ((__m512h)__builtin_ia32_selectph_512( \
402 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
403 (__v32hf)_mm512_setzero_ph()))
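
/* Usage sketch for the masked/rounded forms: with __m512h a, b, src and
 * __mmask32 m,
 *
 *   __m512h r = _mm512_mask_add_round_ph(
 *       src, m, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *
 * adds the lanes selected by m, copies the corresponding lane of src where a
 * mask bit is clear, and rounds to nearest-even regardless of MXCSR; the
 * maskz_ variant zeroes the unselected lanes instead.
 */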
  return (__m512h)((__v32hf)__A - (__v32hf)__B);

_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);

_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
423#define _mm512_sub_round_ph(A, B, R) \
424 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
425 (__v32hf)(__m512h)(B), (int)(R)))
427#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
428 ((__m512h)__builtin_ia32_selectph_512( \
429 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
430 (__v32hf)(__m512h)(W)))
432#define _mm512_maskz_sub_round_ph(U, A, B, R) \
433 ((__m512h)__builtin_ia32_selectph_512( \
434 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
435 (__v32hf)_mm512_setzero_ph()))
  return (__m512h)((__v32hf)__A * (__v32hf)__B);

_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);

_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
455#define _mm512_mul_round_ph(A, B, R) \
456 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
457 (__v32hf)(__m512h)(B), (int)(R)))
459#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
460 ((__m512h)__builtin_ia32_selectph_512( \
461 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
462 (__v32hf)(__m512h)(W)))
464#define _mm512_maskz_mul_round_ph(U, A, B, R) \
465 ((__m512h)__builtin_ia32_selectph_512( \
466 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
467 (__v32hf)_mm512_setzero_ph()))
  return (__m512h)((__v32hf)__A / (__v32hf)__B);

_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);

_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
487#define _mm512_div_round_ph(A, B, R) \
488 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
489 (__v32hf)(__m512h)(B), (int)(R)))
491#define _mm512_mask_div_round_ph(W, U, A, B, R) \
492 ((__m512h)__builtin_ia32_selectph_512( \
493 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
494 (__v32hf)(__m512h)(W)))
496#define _mm512_maskz_div_round_ph(U, A, B, R) \
497 ((__m512h)__builtin_ia32_selectph_512( \
498 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
499 (__v32hf)_mm512_setzero_ph()))
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);

_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);

_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
520#define _mm512_min_round_ph(A, B, R) \
521 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
522 (__v32hf)(__m512h)(B), (int)(R)))
524#define _mm512_mask_min_round_ph(W, U, A, B, R) \
525 ((__m512h)__builtin_ia32_selectph_512( \
526 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
527 (__v32hf)(__m512h)(W)))
529#define _mm512_maskz_min_round_ph(U, A, B, R) \
530 ((__m512h)__builtin_ia32_selectph_512( \
531 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
532 (__v32hf)_mm512_setzero_ph()))
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);

_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);

_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
553#define _mm512_max_round_ph(A, B, R) \
554 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
555 (__v32hf)(__m512h)(B), (int)(R)))
557#define _mm512_mask_max_round_ph(W, U, A, B, R) \
558 ((__m512h)__builtin_ia32_selectph_512( \
559 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
560 (__v32hf)(__m512h)(W)))
562#define _mm512_maskz_max_round_ph(U, A, B, R) \
563 ((__m512h)__builtin_ia32_selectph_512( \
564 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
565 (__v32hf)_mm512_setzero_ph()))
_mm512_abs_ph(__m512h __A) {

_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);

_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);

  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
611#define _mm_add_round_sh(A, B, R) \
612 ((__m128h)__builtin_ia32_addsh_round_mask( \
613 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
614 (__mmask8)-1, (int)(R)))
616#define _mm_mask_add_round_sh(W, U, A, B, R) \
617 ((__m128h)__builtin_ia32_addsh_round_mask( \
618 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
619 (__mmask8)(U), (int)(R)))
621#define _mm_maskz_add_round_sh(U, A, B, R) \
622 ((__m128h)__builtin_ia32_addsh_round_mask( \
623 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
624 (__mmask8)(U), (int)(R)))
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);

  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
647#define _mm_sub_round_sh(A, B, R) \
648 ((__m128h)__builtin_ia32_subsh_round_mask( \
649 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
650 (__mmask8)-1, (int)(R)))
652#define _mm_mask_sub_round_sh(W, U, A, B, R) \
653 ((__m128h)__builtin_ia32_subsh_round_mask( \
654 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
655 (__mmask8)(U), (int)(R)))
657#define _mm_maskz_sub_round_sh(U, A, B, R) \
658 ((__m128h)__builtin_ia32_subsh_round_mask( \
659 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
660 (__mmask8)(U), (int)(R)))
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);

  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
683#define _mm_mul_round_sh(A, B, R) \
684 ((__m128h)__builtin_ia32_mulsh_round_mask( \
685 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
686 (__mmask8)-1, (int)(R)))
688#define _mm_mask_mul_round_sh(W, U, A, B, R) \
689 ((__m128h)__builtin_ia32_mulsh_round_mask( \
690 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
691 (__mmask8)(U), (int)(R)))
693#define _mm_maskz_mul_round_sh(U, A, B, R) \
694 ((__m128h)__builtin_ia32_mulsh_round_mask( \
695 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
696 (__mmask8)(U), (int)(R)))
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);

  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
719#define _mm_div_round_sh(A, B, R) \
720 ((__m128h)__builtin_ia32_divsh_round_mask( \
721 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
722 (__mmask8)-1, (int)(R)))
724#define _mm_mask_div_round_sh(W, U, A, B, R) \
725 ((__m128h)__builtin_ia32_divsh_round_mask( \
726 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
727 (__mmask8)(U), (int)(R)))
729#define _mm_maskz_div_round_sh(U, A, B, R) \
730 ((__m128h)__builtin_ia32_divsh_round_mask( \
731 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
732 (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
758#define _mm_min_round_sh(A, B, R) \
759 ((__m128h)__builtin_ia32_minsh_round_mask( \
760 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
761 (__mmask8)-1, (int)(R)))
763#define _mm_mask_min_round_sh(W, U, A, B, R) \
764 ((__m128h)__builtin_ia32_minsh_round_mask( \
765 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
766 (__mmask8)(U), (int)(R)))
768#define _mm_maskz_min_round_sh(U, A, B, R) \
769 ((__m128h)__builtin_ia32_minsh_round_mask( \
770 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
771 (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
797#define _mm_max_round_sh(A, B, R) \
798 ((__m128h)__builtin_ia32_maxsh_round_mask( \
799 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
800 (__mmask8)-1, (int)(R)))
802#define _mm_mask_max_round_sh(W, U, A, B, R) \
803 ((__m128h)__builtin_ia32_maxsh_round_mask( \
804 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
805 (__mmask8)(U), (int)(R)))
807#define _mm_maskz_max_round_sh(U, A, B, R) \
808 ((__m128h)__builtin_ia32_maxsh_round_mask( \
809 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
810 (__mmask8)(U), (int)(R)))
812#define _mm512_cmp_round_ph_mask(A, B, P, R) \
813 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
814 (__v32hf)(__m512h)(B), (int)(P), \
815 (__mmask32)-1, (int)(R)))
817#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
818 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
819 (__v32hf)(__m512h)(B), (int)(P), \
820 (__mmask32)(U), (int)(R)))
822#define _mm512_cmp_ph_mask(A, B, P) \
823 _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
825#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
826 _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
828#define _mm_cmp_round_sh_mask(X, Y, P, R) \
829 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
830 (__v8hf)(__m128h)(Y), (int)(P), \
831 (__mmask8)-1, (int)(R)))
833#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
834 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
835 (__v8hf)(__m128h)(Y), (int)(P), \
836 (__mmask8)(M), (int)(R)))
838#define _mm_cmp_sh_mask(X, Y, P) \
839 ((__mmask8)__builtin_ia32_cmpsh_mask( \
840 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
841 _MM_FROUND_CUR_DIRECTION))
843#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
844 ((__mmask8)__builtin_ia32_cmpsh_mask( \
845 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
846 _MM_FROUND_CUR_DIRECTION))
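
/* Usage sketch: the packed compares produce one mask bit per half-precision
 * lane, e.g. with __m512h a, b:
 *
 *   __mmask32 m = _mm512_cmp_ph_mask(a, b, _CMP_NEQ_UQ);
 *
 * sets bit i of m when a[i] != b[i] (unordered, quiet).
 */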
  struct __mm_load_sh_struct {
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};

_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src,
                                                 __U & 1);

_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;

_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;

  return *(const __m128h *)__p;

_mm512_loadu_ph(void const *__p) {
  return ((const struct __loadu_ph *)__p)->__v;

_mm256_loadu_ph(void const *__p) {
  return ((const struct __loadu_ph *)__p)->__v;

  return ((const struct __loadu_ph *)__p)->__v;
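
/* The _mm*_load_ph() forms above dereference a vector pointer directly and
 * therefore expect natural alignment (16/32/64 bytes for __m128h/__m256h/
 * __m512h), while the _mm*_loadu_ph() forms read through a packed struct and
 * accept any alignment.
 */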
  struct __mm_store_sh_struct {
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];

  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);

  *(__m512h *)__P = __A;

  *(__m256h *)__P = __A;

  *(__m128h *)__P = __A;

  ((struct __storeu_ph *)__P)->__v = __A;

  ((struct __storeu_ph *)__P)->__v = __A;

  ((struct __storeu_ph *)__P)->__v = __A;
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);

  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());

  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);

_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);

_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);

  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);

_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);

_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1026#define _mm512_getmant_ph(A, B, C) \
1027 ((__m512h)__builtin_ia32_getmantph512_mask( \
1028 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1029 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
1030 _MM_FROUND_CUR_DIRECTION))
1032#define _mm512_mask_getmant_ph(W, U, A, B, C) \
1033 ((__m512h)__builtin_ia32_getmantph512_mask( \
1034 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1035 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1037#define _mm512_maskz_getmant_ph(U, A, B, C) \
1038 ((__m512h)__builtin_ia32_getmantph512_mask( \
1039 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1040 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1042#define _mm512_getmant_round_ph(A, B, C, R) \
1043 ((__m512h)__builtin_ia32_getmantph512_mask( \
1044 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1045 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1047#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
1048 ((__m512h)__builtin_ia32_getmantph512_mask( \
1049 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1050 (__mmask32)(U), (int)(R)))
1052#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
1053 ((__m512h)__builtin_ia32_getmantph512_mask( \
1054 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1055 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
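
/* In the getmant macros, B selects the normalization interval and C the sign
 * control; the two are packed into one immediate as (C << 2) | B.  Typical
 * usage, assuming the _MM_MANT_* enums from <immintrin.h>:
 *
 *   __m512h m = _mm512_getmant_ph(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 *
 * extracts each lane's mantissa normalized to [1, 2) with the source sign.
 */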
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);

_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);

_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
1076#define _mm512_getexp_round_ph(A, R) \
1077 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1078 (__v32hf)_mm512_undefined_ph(), \
1079 (__mmask32)-1, (int)(R)))
1081#define _mm512_mask_getexp_round_ph(W, U, A, R) \
1082 ((__m512h)__builtin_ia32_getexpph512_mask( \
1083 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
1085#define _mm512_maskz_getexp_round_ph(U, A, R) \
1086 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1087 (__v32hf)_mm512_setzero_ph(), \
1088 (__mmask32)(U), (int)(R)))
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);

_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);

_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
1111#define _mm512_scalef_round_ph(A, B, R) \
1112 ((__m512h)__builtin_ia32_scalefph512_mask( \
1113 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1114 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1116#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
1117 ((__m512h)__builtin_ia32_scalefph512_mask( \
1118 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
1119 (__mmask32)(U), (int)(R)))
1121#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
1122 ((__m512h)__builtin_ia32_scalefph512_mask( \
1123 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1124 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1126#define _mm512_roundscale_ph(A, B) \
1127 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1128 (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
1129 _MM_FROUND_CUR_DIRECTION))
1131#define _mm512_mask_roundscale_ph(A, B, C, imm) \
1132 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1133 (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
1134 (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
1136#define _mm512_maskz_roundscale_ph(A, B, imm) \
1137 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1138 (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1139 (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
1141#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
1142 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
1143 (__v32hf)(__m512h)(A), \
1144 (__mmask32)(B), (int)(R)))
1146#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
1147 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
1148 (__v32hf)_mm512_setzero_ph(), \
1149 (__mmask32)(A), (int)(R)))
1151#define _mm512_roundscale_round_ph(A, imm, R) \
1152 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
1153 (__v32hf)_mm512_undefined_ph(), \
1154 (__mmask32)-1, (int)(R)))
1156#define _mm512_reduce_ph(A, imm) \
1157 ((__m512h)__builtin_ia32_reduceph512_mask( \
1158 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
1159 (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1161#define _mm512_mask_reduce_ph(W, U, A, imm) \
1162 ((__m512h)__builtin_ia32_reduceph512_mask( \
1163 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
1164 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1166#define _mm512_maskz_reduce_ph(U, A, imm) \
1167 ((__m512h)__builtin_ia32_reduceph512_mask( \
1168 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1169 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1171#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
1172 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1173 (__v32hf)(__m512h)(W), \
1174 (__mmask32)(U), (int)(R)))
1176#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
1177 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1178 (__v32hf)_mm512_setzero_ph(), \
1179 (__mmask32)(U), (int)(R)))
1181#define _mm512_reduce_round_ph(A, imm, R) \
1182 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1183 (__v32hf)_mm512_undefined_ph(), \
1184 (__mmask32)-1, (int)(R)))
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);

  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);

  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);

  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);

  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);

_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1227#define _mm_getmant_round_sh(A, B, C, D, R) \
1228 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1229 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1230 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
1232#define _mm_getmant_sh(A, B, C, D) \
1233 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1234 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1235 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
1237#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
1238 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1239 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1240 (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1242#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
1243 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1244 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1245 (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
1247#define _mm_maskz_getmant_sh(U, A, B, C, D) \
1248 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1249 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1250 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1252#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
1253 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1254 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1255 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1257#define _mm_getexp_round_sh(A, B, R) \
1258 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1259 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1260 (__mmask8)-1, (int)(R)))
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
1276#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
1277 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1278 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1279 (__mmask8)(U), (int)(R)))
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
1288#define _mm_maskz_getexp_round_sh(U, A, B, R) \
1289 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1290 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1291 (__mmask8)(U), (int)(R)))
1293#define _mm_scalef_round_sh(A, B, R) \
1294 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1295 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1296 (__mmask8)-1, (int)(R)))
1300 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1301 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1306_mm_mask_scalef_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1307 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1312#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
1313 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1314 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1315 (__mmask8)(U), (int)(R)))
1318_mm_maskz_scalef_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1319 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1320 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1324#define _mm_maskz_scalef_round_sh(U, A, B, R) \
1325 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1326 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1327 (__mmask8)(U), (int)(R)))
1329#define _mm_roundscale_round_sh(A, B, imm, R) \
1330 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1331 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1332 (__mmask8)-1, (int)(imm), (int)(R)))
1334#define _mm_roundscale_sh(A, B, imm) \
1335 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1336 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1337 (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
1339#define _mm_mask_roundscale_sh(W, U, A, B, I) \
1340 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1341 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1342 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1344#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
1345 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1346 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1347 (__mmask8)(U), (int)(I), (int)(R)))
1349#define _mm_maskz_roundscale_sh(U, A, B, I) \
1350 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1351 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1352 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1354#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
1355 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1356 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1357 (__mmask8)(U), (int)(I), (int)(R)))
1359#define _mm_reduce_sh(A, B, C) \
1360 ((__m128h)__builtin_ia32_reducesh_mask( \
1361 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1362 (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
1364#define _mm_mask_reduce_sh(W, U, A, B, C) \
1365 ((__m128h)__builtin_ia32_reducesh_mask( \
1366 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1367 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1369#define _mm_maskz_reduce_sh(U, A, B, C) \
1370 ((__m128h)__builtin_ia32_reducesh_mask( \
1371 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1372 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1374#define _mm_reduce_round_sh(A, B, C, R) \
1375 ((__m128h)__builtin_ia32_reducesh_mask( \
1376 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1377 (__mmask8)-1, (int)(C), (int)(R)))
1379#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
1380 ((__m128h)__builtin_ia32_reducesh_mask( \
1381 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1382 (__mmask8)(U), (int)(C), (int)(R)))
1384#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
1385 ((__m128h)__builtin_ia32_reducesh_mask( \
1386 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1387 (__mmask8)(U), (int)(C), (int)(R)))
1389#define _mm512_sqrt_round_ph(A, R) \
1390 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
1392#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
1393 ((__m512h)__builtin_ia32_selectph_512( \
1394 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1395 (__v32hf)(__m512h)(W)))
1397#define _mm512_maskz_sqrt_round_ph(U, A, R) \
1398 ((__m512h)__builtin_ia32_selectph_512( \
1399 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1400 (__v32hf)_mm512_setzero_ph()))
1403 return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1408_mm512_mask_sqrt_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1409 return (__m512h)__builtin_ia32_selectph_512(
1412 (__v32hf)(__m512h)(__W));
1416_mm512_maskz_sqrt_ph(
__mmask32 __U, __m512h __A) {
1417 return (__m512h)__builtin_ia32_selectph_512(
1420 (__v32hf)_mm512_setzero_ph());
1423#define _mm_sqrt_round_sh(A, B, R) \
1424 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1425 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1426 (__mmask8)-1, (int)(R)))
1428#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
1429 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1430 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1431 (__mmask8)(U), (int)(R)))
1433#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
1434 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1435 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1436 (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1462#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
1463 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1464 (int)(imm), (__mmask32)(U)))
1466#define _mm512_fpclass_ph_mask(A, imm) \
1467 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1468 (int)(imm), (__mmask32)-1))
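
/* The fpclass immediate is a bit-OR of the categories to test (per the
 * VFPCLASS encoding): 0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +Inf, 0x10 -Inf,
 * 0x20 denormal, 0x40 finite negative, 0x80 SNaN.  For example,
 *
 *   __mmask32 inf = _mm512_fpclass_ph_mask(x, 0x08 | 0x10);
 *
 * flags every lane of x that is +Inf or -Inf.
 */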
#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1478#define _mm512_cvt_roundpd_ph(A, R) \
1479 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1480 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1482#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1483 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1484 (__mmask8)(U), (int)(R)))
1486#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1487 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1488 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);

_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
1509#define _mm512_cvt_roundph_pd(A, R) \
1510 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1511 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1513#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1514 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1515 (__mmask8)(U), (int)(R)))
1517#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1518 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1519 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1522 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1528_mm512_mask_cvtph_pd(__m512d __W,
__mmask8 __U, __m128h __A) {
1529 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1534_mm512_maskz_cvtph_pd(
__mmask8 __U, __m128h __A) {
1535 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1540#define _mm_cvt_roundsh_ss(A, B, R) \
1541 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1542 (__v4sf)_mm_undefined_ps(), \
1543 (__mmask8)(-1), (int)(R)))
1545#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1546 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1547 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1549#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1550 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1551 (__v4sf)_mm_setzero_ps(), \
1552 (__mmask8)(U), (int)(R)))
1556 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1565 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1573 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1578#define _mm_cvt_roundss_sh(A, B, R) \
1579 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1580 (__v8hf)_mm_undefined_ph(), \
1581 (__mmask8)(-1), (int)(R)))
1583#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1584 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1585 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1587#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1588 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1589 (__v8hf)_mm_setzero_ph(), \
1590 (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
1616#define _mm_cvt_roundsd_sh(A, B, R) \
1617 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1618 (__v8hf)_mm_undefined_ph(), \
1619 (__mmask8)(-1), (int)(R)))
1621#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1622 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1623 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1625#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1626 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1627 (__v8hf)_mm_setzero_ph(), \
1628 (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);

_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
1653#define _mm_cvt_roundsh_sd(A, B, R) \
1654 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1655 (__v2df)_mm_undefined_pd(), \
1656 (__mmask8)(-1), (int)(R)))
1658#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1659 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1660 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1662#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1663 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1664 (__v2df)_mm_setzero_pd(), \
1665 (__mmask8)(U), (int)(R)))
1669 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1678 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1679 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (
__mmask8)__U,
1684_mm_maskz_cvtsh_sd(
__mmask8 __U, __m128d __A, __m128h __B) {
1685 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1690#define _mm512_cvt_roundph_epi16(A, R) \
1691 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1692 (__v32hi)_mm512_undefined_epi32(), \
1693 (__mmask32)(-1), (int)(R)))
1695#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1696 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1697 (__mmask32)(U), (int)(R)))
1699#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1700 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1701 (__v32hi)_mm512_setzero_epi32(), \
1702 (__mmask32)(U), (int)(R)))
1705_mm512_cvtph_epi16(__m512h __A) {
1706 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1712_mm512_mask_cvtph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1713 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1718_mm512_maskz_cvtph_epi16(
__mmask32 __U, __m512h __A) {
1719 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1724#define _mm512_cvtt_roundph_epi16(A, R) \
1725 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1726 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1729#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1730 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1731 (__mmask32)(U), (int)(R)))
1733#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1734 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1735 (__v32hi)_mm512_setzero_epi32(), \
1736 (__mmask32)(U), (int)(R)))
1739_mm512_cvttph_epi16(__m512h __A) {
1740 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1746_mm512_mask_cvttph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1747 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1752_mm512_maskz_cvttph_epi16(
__mmask32 __U, __m512h __A) {
1753 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1758#define _mm512_cvt_roundepi16_ph(A, R) \
1759 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1760 (__v32hf)_mm512_undefined_ph(), \
1761 (__mmask32)(-1), (int)(R)))
1763#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1764 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1765 (__mmask32)(U), (int)(R)))
1767#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1768 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1769 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
_mm512_cvtepi16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);

_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);

_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
1791#define _mm512_cvt_roundph_epu16(A, R) \
1792 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1793 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1796#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1797 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1798 (__mmask32)(U), (int)(R)))
1800#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1801 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1802 (__v32hu)_mm512_setzero_epi32(), \
1803 (__mmask32)(U), (int)(R)))
1806_mm512_cvtph_epu16(__m512h __A) {
1807 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1813_mm512_mask_cvtph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1814 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1819_mm512_maskz_cvtph_epu16(
__mmask32 __U, __m512h __A) {
1820 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1825#define _mm512_cvtt_roundph_epu16(A, R) \
1826 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1827 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1830#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1831 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1832 (__mmask32)(U), (int)(R)))
1834#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1835 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1836 (__v32hu)_mm512_setzero_epi32(), \
1837 (__mmask32)(U), (int)(R)))
1840_mm512_cvttph_epu16(__m512h __A) {
1841 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1847_mm512_mask_cvttph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1848 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1853_mm512_maskz_cvttph_epu16(
__mmask32 __U, __m512h __A) {
1854 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1859#define _mm512_cvt_roundepu16_ph(A, R) \
1860 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1861 (__v32hf)_mm512_undefined_ph(), \
1862 (__mmask32)(-1), (int)(R)))
1864#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1865 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1866 (__mmask32)(U), (int)(R)))
1868#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1869 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1870 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1873_mm512_cvtepu16_ph(__m512i __A) {
1874 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1875 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1880_mm512_mask_cvtepu16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1881 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1886_mm512_maskz_cvtepu16_ph(
__mmask32 __U, __m512i __A) {
1887 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1888 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1892#define _mm512_cvt_roundph_epi32(A, R) \
1893 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1894 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1897#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1898 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1899 (__mmask16)(U), (int)(R)))
1901#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1902 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1903 (__v16si)_mm512_setzero_epi32(), \
1904 (__mmask16)(U), (int)(R)))
1907_mm512_cvtph_epi32(__m256h __A) {
1908 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1914_mm512_mask_cvtph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
1915 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1920_mm512_maskz_cvtph_epi32(
__mmask16 __U, __m256h __A) {
1921 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1926#define _mm512_cvt_roundph_epu32(A, R) \
1927 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1928 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1931#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1932 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1933 (__mmask16)(U), (int)(R)))
1935#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1936 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1937 (__v16su)_mm512_setzero_epi32(), \
1938 (__mmask16)(U), (int)(R)))
1941_mm512_cvtph_epu32(__m256h __A) {
1942 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1948_mm512_mask_cvtph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
1949 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1954_mm512_maskz_cvtph_epu32(
__mmask16 __U, __m256h __A) {
1955 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1960#define _mm512_cvt_roundepi32_ph(A, R) \
1961 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1962 (__v16hf)_mm256_undefined_ph(), \
1963 (__mmask16)(-1), (int)(R)))
1965#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1966 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1967 (__mmask16)(U), (int)(R)))
1969#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1970 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1971 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1974_mm512_cvtepi32_ph(__m512i __A) {
1975 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1976 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1981_mm512_mask_cvtepi32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1982 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1987_mm512_maskz_cvtepi32_ph(
__mmask16 __U, __m512i __A) {
1988 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1989 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
1993#define _mm512_cvt_roundepu32_ph(A, R) \
1994 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1995 (__v16hf)_mm256_undefined_ph(), \
1996 (__mmask16)(-1), (int)(R)))
1998#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1999 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
2000 (__mmask16)(U), (int)(R)))
2002#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
2003 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
2004 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2007_mm512_cvtepu32_ph(__m512i __A) {
2008 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2009 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
2014_mm512_mask_cvtepu32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
2015 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2020_mm512_maskz_cvtepu32_ph(
__mmask16 __U, __m512i __A) {
2021 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2022 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
2026#define _mm512_cvtt_roundph_epi32(A, R) \
2027 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2028 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2031#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2032 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2033 (__mmask16)(U), (int)(R)))
2035#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2036 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2037 (__v16si)_mm512_setzero_epi32(), \
2038 (__mmask16)(U), (int)(R)))
2041_mm512_cvttph_epi32(__m256h __A) {
2042 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2048_mm512_mask_cvttph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
2049 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2054_mm512_maskz_cvttph_epi32(
__mmask16 __U, __m256h __A) {
2055 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2060#define _mm512_cvtt_roundph_epu32(A, R) \
2061 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2062 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2065#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2066 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2067 (__mmask16)(U), (int)(R)))
2069#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2070 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2071 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2075_mm512_cvttph_epu32(__m256h __A) {
2076 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2082_mm512_mask_cvttph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
2083 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2088_mm512_maskz_cvttph_epu32(
__mmask16 __U, __m256h __A) {
2089 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2094#define _mm512_cvt_roundepi64_ph(A, R) \
2095 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2096 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2098#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2099 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2100 (__mmask8)(U), (int)(R)))
2102#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2103 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2104 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2107_mm512_cvtepi64_ph(__m512i __A) {
2108 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2109 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2114_mm512_mask_cvtepi64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2115 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2120_mm512_maskz_cvtepi64_ph(
__mmask8 __U, __m512i __A) {
2121 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2122 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2126#define _mm512_cvt_roundph_epi64(A, R) \
2127 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2128 (__v8di)_mm512_undefined_epi32(), \
2129 (__mmask8)(-1), (int)(R)))
2131#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2132 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2133 (__mmask8)(U), (int)(R)))
2135#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2136 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2137 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2140_mm512_cvtph_epi64(__m128h __A) {
2141 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2147_mm512_mask_cvtph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2148 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2153_mm512_maskz_cvtph_epi64(
__mmask8 __U, __m128h __A) {
2154 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2159#define _mm512_cvt_roundepu64_ph(A, R) \
2160 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2161 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2163#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2164 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2165 (__mmask8)(U), (int)(R)))
2167#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2168 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2169 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2172_mm512_cvtepu64_ph(__m512i __A) {
2173 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2174 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2179_mm512_mask_cvtepu64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2180 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2185_mm512_maskz_cvtepu64_ph(
__mmask8 __U, __m512i __A) {
2186 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2187 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2191#define _mm512_cvt_roundph_epu64(A, R) \
2192 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2193 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2196#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2197 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2198 (__mmask8)(U), (int)(R)))
2200#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2201 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2202 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2205_mm512_cvtph_epu64(__m128h __A) {
2206 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2212_mm512_mask_cvtph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2213 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2218_mm512_maskz_cvtph_epu64(
__mmask8 __U, __m128h __A) {
2219 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2224#define _mm512_cvtt_roundph_epi64(A, R) \
2225 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2226 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2229#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2230 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2231 (__mmask8)(U), (int)(R)))
2233#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2234 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2235 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2238_mm512_cvttph_epi64(__m128h __A) {
2239 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2245_mm512_mask_cvttph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2246 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2251_mm512_maskz_cvttph_epi64(
__mmask8 __U, __m128h __A) {
2252 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2257#define _mm512_cvtt_roundph_epu64(A, R) \
2258 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2259 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2262#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2263 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2264 (__mmask8)(U), (int)(R)))
2266#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2267 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2268 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
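/* Illustrative usage sketch (not part of this header): the cvtt forms always
 * truncate toward zero, so their rounding argument only controls exception
 * suppression (SAE). Assumes <immintrin.h> and -mavx512fp16; the helper name
 * is made up. */
static __inline__ __m512i example_halves_to_u64_trunc(__m128h vals) {
  /* Truncate eight _Float16 lanes to unsigned 64-bit integers without
   * raising floating-point exceptions. */
  return _mm512_maskz_cvtt_roundph_epu64((__mmask8)0xFF, vals,
                                         _MM_FROUND_NO_EXC);
}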
2271_mm512_cvttph_epu64(__m128h __A) {
2272 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2278_mm512_mask_cvttph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2279 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2284_mm512_maskz_cvttph_epu64(
__mmask8 __U, __m128h __A) {
2285 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2290#define _mm_cvt_roundsh_i32(A, R) \
2291 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2297#define _mm_cvt_roundsh_u32(A, R) \
2298 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2301_mm_cvtsh_u32(__m128h __A) {
2302 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2307#define _mm_cvt_roundsh_i64(A, R) \
2308 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2311 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2315#define _mm_cvt_roundsh_u64(A, R) \
2316 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
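/* Illustrative usage sketch (not part of this header): the _sh scalar
 * conversions operate on element 0 of a __m128h. Assumes <immintrin.h> and
 * -mavx512fp16; the helper is hypothetical. */
static __inline__ int example_low_half_to_int(__m128h v) {
  /* Convert the lowest _Float16 element with round-to-nearest-even and
   * suppressed exceptions; the cvtt_roundsh forms truncate instead. */
  return _mm_cvt_roundsh_i32(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}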
2319_mm_cvtsh_u64(__m128h __A) {
2320 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2325#define _mm_cvt_roundu32_sh(A, B, R) \
2326 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2329_mm_cvtu32_sh(__m128h __A,
unsigned int __B) {
2335#define _mm_cvt_roundu64_sh(A, B, R) \
2336 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
 (int)(R)))
2340_mm_cvtu64_sh(__m128h __A,
unsigned long long __B) {
2346#define _mm_cvt_roundi32_sh(A, B, R) \
2347 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2356#define _mm_cvt_roundi64_sh(A, B, R) \
2357 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2366#define _mm_cvtt_roundsh_i32(A, R) \
2367 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2370 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2375#define _mm_cvtt_roundsh_i64(A, R) \
2376 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2379 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2384#define _mm_cvtt_roundsh_u32(A, R) \
2385 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2388_mm_cvttsh_u32(__m128h __A) {
2389 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2394#define _mm_cvtt_roundsh_u64(A, R) \
2395 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2398_mm_cvttsh_u64(__m128h __A) {
2399 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2404#define _mm512_cvtx_roundph_ps(A, R) \
2405 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2406 (__v16sf)_mm512_undefined_ps(), \
2407 (__mmask16)(-1), (int)(R)))
2409#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2410 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2411 (__mmask16)(U), (int)(R)))
2413#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2414 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2415 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2418 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2424_mm512_mask_cvtxph_ps(__m512 __W,
__mmask16 __U, __m256h __A) {
2425 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2430_mm512_maskz_cvtxph_ps(
__mmask16 __U, __m256h __A) {
2431 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2436#define _mm512_cvtx_roundps_ph(A, R) \
2437 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2438 (__v16hf)_mm256_undefined_ph(), \
2439 (__mmask16)(-1), (int)(R)))
2441#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2442 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2443 (__mmask16)(U), (int)(R)))
2445#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2446 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2447 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
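/* Illustrative usage sketch (not part of this header): widening 16 halves to
 * single precision, computing there, then narrowing back. Assumes
 * <immintrin.h> and -mavx512fp16; the helper name is made up. */
static __inline__ __m256h example_scale_via_ps(__m256h x, __m512 scale) {
  __m512 wide = _mm512_cvtx_roundph_ps(x, _MM_FROUND_NO_EXC); /* ph -> ps is exact */
  __m512 scaled = _mm512_mul_ps(wide, scale);
  /* ps -> ph can round, so an explicit rounding mode is meaningful here. */
  return _mm512_cvtx_roundps_ph(scaled,
                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}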
2450 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2451 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2456_mm512_mask_cvtxps_ph(__m256h __W,
__mmask16 __U, __m512 __A) {
2457 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2462_mm512_maskz_cvtxps_ph(
__mmask16 __U, __m512 __A) {
2463 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2464 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2468#define _mm512_fmadd_round_ph(A, B, C, R) \
2469 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2470 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2471 (__mmask32)-1, (int)(R)))
2473#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2474 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2475 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2476 (__mmask32)(U), (int)(R)))
2478#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2479 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2480 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2481 (__mmask32)(U), (int)(R)))
2483#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2484 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2485 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2486 (__mmask32)(U), (int)(R)))
2488#define _mm512_fmsub_round_ph(A, B, C, R) \
2489 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2490 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2491 (__mmask32)-1, (int)(R)))
2493#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2494 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2495 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2496 (__mmask32)(U), (int)(R)))
2498#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2499 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2500 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2501 (__mmask32)(U), (int)(R)))
2503#define _mm512_fnmadd_round_ph(A, B, C, R) \
2504 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2505 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2506 (__mmask32)-1, (int)(R)))
2508#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2509 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2510 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2511 (__mmask32)(U), (int)(R)))
2513#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2514 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2515 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2516 (__mmask32)(U), (int)(R)))
2518#define _mm512_fnmsub_round_ph(A, B, C, R) \
2519 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2520 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2521 (__mmask32)-1, (int)(R)))
2523#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2524 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2525 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2526 (__mmask32)(U), (int)(R)))
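/* Illustrative usage sketch (not part of this header): a fused a*b+c over 32
 * _Float16 lanes with an explicit rounding mode. The fmsub/fnmadd/fnmsub
 * variants above are expressed through the same vfmaddph builtin by negating
 * operands. Assumes <immintrin.h> and -mavx512fp16; helper name is made up. */
static __inline__ __m512h example_fma_round_down(__m512h a, __m512h b,
                                                 __m512h c) {
  /* Round the fused result toward negative infinity. */
  return _mm512_fmadd_round_ph(a, b, c,
                               _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}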
2531 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2537_mm512_mask_fmadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2538 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2544_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2545 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2551_mm512_maskz_fmadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2552 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2560 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2566_mm512_mask_fmsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2567 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2573_mm512_maskz_fmsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2574 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2575 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2582 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2588_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2589 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2595_mm512_maskz_fnmadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2596 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2604 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2610_mm512_maskz_fnmsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2611 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2612 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2616#define _mm512_fmaddsub_round_ph(A, B, C, R) \
2617 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2618 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2619 (__mmask32)-1, (int)(R)))
2621#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2622 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2623 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2624 (__mmask32)(U), (int)(R)))
2626#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2627 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2628 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2629 (__mmask32)(U), (int)(R)))
2631#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2632 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2633 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2634 (__mmask32)(U), (int)(R)))
2636#define _mm512_fmsubadd_round_ph(A, B, C, R) \
2637 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2638 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2639 (__mmask32)-1, (int)(R)))
2641#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2642 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2643 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2644 (__mmask32)(U), (int)(R)))
2646#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2647 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2648 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2649 (__mmask32)(U), (int)(R)))
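/* Illustrative usage sketch (not part of this header): fmaddsub alternates
 * per lane (even-indexed lanes subtract c, odd-indexed lanes add it, per the
 * usual fmaddsub convention; fmsubadd is the opposite), a common building
 * block for interleaved complex arithmetic. Assumes <immintrin.h> and
 * -mavx512fp16; the helper is hypothetical. */
static __inline__ __m512h example_alternating_fma(__m512h a, __m512h b,
                                                  __m512h c) {
  return _mm512_fmaddsub_round_ph(a, b, c, _MM_FROUND_CUR_DIRECTION);
}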
2652_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2653 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2654 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2659_mm512_mask_fmaddsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2660 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2661 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2666_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2667 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2668 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2673_mm512_maskz_fmaddsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2674 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2675 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2680_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2681 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2682 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2687_mm512_mask_fmsubadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2688 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2689 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2694_mm512_maskz_fmsubadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2695 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2696 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2700#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2701 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2702 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2703 (__mmask32)(U), (int)(R)))
2706_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2707 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2712#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2713 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2714 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2715 (__mmask32)(U), (int)(R)))
2718_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2719 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2720 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
2724#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2725 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2726 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2727 (__mmask32)(U), (int)(R)))
2730_mm512_mask_fnmadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2731 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2736#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2737 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2738 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2739 (__mmask32)(U), (int)(R)))
2741#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2742 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2743 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2744 (__mmask32)(U), (int)(R)))
2747_mm512_mask_fnmsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2748 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2754_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2755 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2763 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2771 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2775#define _mm_fmadd_round_sh(A, B, C, R) \
2776 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2777 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2778 (__mmask8)-1, (int)(R)))
2780#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2781 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2782 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2783 (__mmask8)(U), (int)(R)))
2786_mm_maskz_fmadd_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2787 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2792#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2793 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2794 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2795 (__mmask8)(U), (int)(R)))
2798_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y,
__mmask8 __U) {
2799 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2804#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2805 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2806 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2807 (__mmask8)(U), (int)(R)))
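/* Illustrative usage sketch (not part of this header): the _sh forms fuse
 * only element 0 and carry the remaining seven _Float16 elements through
 * unchanged from one of the sources. Assumes <immintrin.h> and -mavx512fp16;
 * the helper is hypothetical. */
static __inline__ __m128h example_scalar_fma(__m128h a, __m128h b, __m128h c) {
  return _mm_fmadd_round_sh(a, b, c, _MM_FROUND_CUR_DIRECTION);
}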
2812 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2821 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2826#define _mm_fmsub_round_sh(A, B, C, R) \
2827 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2828 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2829 (__mmask8)-1, (int)(R)))
2831#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2832 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2833 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2834 (__mmask8)(U), (int)(R)))
2837_mm_maskz_fmsub_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2838 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2843#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2844 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2845 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2846 (__mmask8)(U), (int)(R)))
2849_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y,
__mmask8 __U) {
2850 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2855#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2856 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2857 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2858 (__mmask8)(U), (int)(R)))
2863 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2868_mm_mask_fnmadd_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
2869 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2873#define _mm_fnmadd_round_sh(A, B, C, R) \
2874 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2875 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2876 (__mmask8)-1, (int)(R)))
2878#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2879 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2880 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2881 (__mmask8)(U), (int)(R)))
2884_mm_maskz_fnmadd_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2885 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2890#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2891 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2892 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2893 (__mmask8)(U), (int)(R)))
2896_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y,
__mmask8 __U) {
2897 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2902#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2903 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2904 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2905 (__mmask8)(U), (int)(R)))
2910 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2915_mm_mask_fnmsub_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
2916 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2920#define _mm_fnmsub_round_sh(A, B, C, R) \
2921 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2922 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2923 (__mmask8)-1, (int)(R)))
2925#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2926 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2927 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2928 (__mmask8)(U), (int)(R)))
2931_mm_maskz_fnmsub_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2932 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2937#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2938 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2939 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2940 (__mmask8)(U), (int)(R)))
2943_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y,
__mmask8 __U) {
2944 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2949#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2950 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2951 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2952 (__mmask8)(U), (int)(R)))
2957 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2963_mm_mask_fcmadd_sch(__m128h __A,
__mmask8 __U, __m128h __B, __m128h __C) {
2964 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2969_mm_maskz_fcmadd_sch(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2970 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2976_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C,
__mmask8 __U) {
2977 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2981#define _mm_fcmadd_round_sch(A, B, C, R) \
2982 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2983 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2984 (__mmask8)-1, (int)(R)))
2986#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2987 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2988 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2989 (__mmask8)(U), (int)(R)))
2991#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2992 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2993 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2994 (__mmask8)(U), (int)(R)))
2996#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2997 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2998 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2999 (__mmask8)(U), (int)(R)))
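/* Illustrative usage sketch (not part of this header): the _sch forms treat
 * the low 32 bits of a __m128h as one complex _Float16 value (real part in
 * the lower half, imaginary part in the upper half). fcmadd multiplies by
 * the conjugate of the second operand before accumulating, while the fmadd
 * forms below use it unconjugated. Assumes <immintrin.h> and -mavx512fp16;
 * the helper name is made up. */
static __inline__ __m128h example_conj_cmadd(__m128h a, __m128h b,
                                             __m128h acc) {
  /* acc += a * conj(b) for the low complex element. */
  return _mm_fcmadd_round_sch(a, b, acc, _MM_FROUND_CUR_DIRECTION);
}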
3004 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
3010_mm_mask_fmadd_sch(__m128h __A,
__mmask8 __U, __m128h __B, __m128h __C) {
3011 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
3016_mm_maskz_fmadd_sch(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
3017 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3023_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C,
__mmask8 __U) {
3024 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3028#define _mm_fmadd_round_sch(A, B, C, R) \
3029 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3030 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3031 (__mmask8)-1, (int)(R)))
3033#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3034 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3035 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3036 (__mmask8)(U), (int)(R)))
3038#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3039 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3040 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3041 (__mmask8)(U), (int)(R)))
3043#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3044 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3045 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3046 (__mmask8)(U), (int)(R)))
3050 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3051 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3056_mm_mask_fcmul_sch(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
3057 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3063_mm_maskz_fcmul_sch(
__mmask8 __U, __m128h __A, __m128h __B) {
3064 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3065 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3069#define _mm_fcmul_round_sch(A, B, R) \
3070 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3071 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3072 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3074#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3075 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3076 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3077 (__mmask8)(U), (int)(R)))
3079#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3080 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3081 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3082 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3086 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3087 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3095 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3101_mm_maskz_fmul_sch(
__mmask8 __U, __m128h __A, __m128h __B) {
3102 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3103 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3107#define _mm_fmul_round_sch(A, B, R) \
3108 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3109 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3110 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3112#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3113 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3114 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3115 (__mmask8)(U), (int)(R)))
3117#define _mm_maskz_fmul_round_sch(U, A, B, R) \
3118 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3119 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3120 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
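/* Illustrative usage sketch (not part of this header): fmul_sch multiplies
 * the low complex _Float16 elements, while fcmul_sch multiplies by the
 * conjugate of the second operand, the usual idiom for complex dot products.
 * Assumes <immintrin.h> and -mavx512fp16; the helper name is made up. */
static __inline__ __m128h example_cmul_conj(__m128h a, __m128h b) {
  /* Low complex element: a * conj(b). */
  return _mm_fcmul_round_sch(a, b, _MM_FROUND_CUR_DIRECTION);
}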
3124 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3125 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3130_mm512_mask_fcmul_pch(__m512h __W,
__mmask16 __U, __m512h __A, __m512h __B) {
3131 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3137_mm512_maskz_fcmul_pch(
__mmask16 __U, __m512h __A, __m512h __B) {
3138 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3139 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3143#define _mm512_fcmul_round_pch(A, B, R) \
3144 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3145 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3146 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3148#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3149 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3150 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3151 (__mmask16)(U), (int)(R)))
3153#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3154 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3155 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3156 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3160 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3161 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3166_mm512_mask_fmul_pch(__m512h __W,
__mmask16 __U, __m512h __A, __m512h __B) {
3167 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3173_mm512_maskz_fmul_pch(
__mmask16 __U, __m512h __A, __m512h __B) {
3174 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3175 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3179#define _mm512_fmul_round_pch(A, B, R) \
3180 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3181 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3182 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3184#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3185 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3186 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3187 (__mmask16)(U), (int)(R)))
3189#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3190 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3191 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3192 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
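/* Illustrative usage sketch (not part of this header): each __m512h holds 16
 * interleaved complex _Float16 values, so a single fmul_pch performs 16
 * complex multiplications. Assumes <immintrin.h> and -mavx512fp16; helper
 * name is made up. */
static __inline__ __m512h example_pointwise_cmul(__m512h a, __m512h b) {
  return _mm512_fmul_round_pch(a, b, _MM_FROUND_CUR_DIRECTION);
}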
3197 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3198 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3203_mm512_mask_fcmadd_pch(__m512h __A,
__mmask16 __U, __m512h __B, __m512h __C) {
3204 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3205 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3210_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C,
__mmask16 __U) {
3211 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3212 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3217_mm512_maskz_fcmadd_pch(
__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3218 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3219 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3223#define _mm512_fcmadd_round_pch(A, B, C, R) \
3224 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3225 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3226 (__mmask16)-1, (int)(R)))
3228#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3229 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3230 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3231 (__mmask16)(U), (int)(R)))
3233#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3234 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3235 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3236 (__mmask16)(U), (int)(R)))
3238#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3239 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3240 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3241 (__mmask16)(U), (int)(R)))
3246 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3252_mm512_mask_fmadd_pch(__m512h __A,
__mmask16 __U, __m512h __B, __m512h __C) {
3253 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3259_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C,
__mmask16 __U) {
3260 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3261 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3266_mm512_maskz_fmadd_pch(
__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3267 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3268 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3272#define _mm512_fmadd_round_pch(A, B, C, R) \
3273 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3274 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3275 (__mmask16)-1, (int)(R)))
3277#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3278 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3279 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3280 (__mmask16)(U), (int)(R)))
3282#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3283 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3284 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3285 (__mmask16)(U), (int)(R)))
3287#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3288 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3289 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3290 (__mmask16)(U), (int)(R)))
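/* Illustrative usage sketch (not part of this header): accumulating a
 * pointwise complex product, the core step of a complex dot product or a
 * small complex GEMM. Assumes <immintrin.h> and -mavx512fp16; the helper is
 * hypothetical. */
static __inline__ __m512h example_cplx_accumulate(__m512h a, __m512h b,
                                                  __m512h acc) {
  /* acc += a * b, element-wise over 16 complex _Float16 values. */
  return _mm512_fmadd_round_pch(a, b, acc, _MM_FROUND_CUR_DIRECTION);
}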
3293_mm512_reduce_add_ph(__m512h __W) {
3294 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3298_mm512_reduce_mul_ph(__m512h __W) {
3299 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3303_mm512_reduce_max_ph(__m512h __V) {
3304 return __builtin_ia32_reduce_fmax_ph512(__V);
3308_mm512_reduce_min_ph(__m512h __V) {
3309 return __builtin_ia32_reduce_fmin_ph512(__V);
3313_mm512_mask_blend_ph(
__mmask32 __U, __m512h __A, __m512h __W) {
3314 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3319_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3320 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3325_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3326 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
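/* Illustrative usage sketch (not part of this header): the reduce_* helpers
 * collapse all 32 _Float16 lanes into one scalar, e.g. for a horizontal sum
 * after a vectorised loop. Assumes <immintrin.h> and -mavx512fp16; the
 * helper name is made up. */
static __inline__ _Float16 example_horizontal_sum(const _Float16 *p) {
  __m512h v = _mm512_loadu_ph(p); /* 32 contiguous half-precision values */
  return _mm512_reduce_add_ph(v);
}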
3330#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
3331#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
3332#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
3333#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
3334#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
3335 _mm512_mask_fmul_round_pch(W, U, A, B, R)
3336#define _mm512_maskz_mul_round_pch(U, A, B, R) \
3337 _mm512_maskz_fmul_round_pch(U, A, B, R)
3339#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
3340#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
3341#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
3342#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
3343#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
3344 _mm512_mask_fcmul_round_pch(W, U, A, B, R)
3345#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
3346 _mm512_maskz_fcmul_round_pch(U, A, B, R)
3348#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
3349#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
3350#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
3351#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
3352#define _mm_mask_mul_round_sch(W, U, A, B, R) \
3353 _mm_mask_fmul_round_sch(W, U, A, B, R)
3354#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
3356#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
3357#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
3358#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
3359#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
3360#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
3361 _mm_mask_fcmul_round_sch(W, U, A, B, R)
3362#define _mm_maskz_cmul_round_sch(U, A, B, R) \
3363 _mm_maskz_fcmul_round_sch(U, A, B, R)
3365#undef __DEFAULT_FN_ATTRS128
3366#undef __DEFAULT_FN_ATTRS256
3367#undef __DEFAULT_FN_ATTRS512
3368#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
3369#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
3370#undef __DEFAULT_FN_ATTRS512_CONSTEXPR