10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
13 #ifndef __AVX512FP16INTRIN_H
14 #define __AVX512FP16INTRIN_H
28 #define __DEFAULT_FN_ATTRS512 \
29 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
30 __min_vector_width__(512)))
31 #define __DEFAULT_FN_ATTRS256 \
32 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
33 __min_vector_width__(256)))
34 #define __DEFAULT_FN_ATTRS128 \
35 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
36 __min_vector_width__(128)))
43 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
47 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
48 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
52 return (__m256h)__builtin_ia32_undef256();
56 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
57 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
58 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
62 return (__m128h)__builtin_ia32_undef128();
66 return (__m512h)__builtin_ia32_undef512();
70 return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
71 __h, __h, __h, __h, __h, __h, __h, __h,
72 __h, __h, __h, __h, __h, __h, __h, __h,
73 __h, __h, __h, __h, __h, __h, __h, __h};
85 return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
86 __h25, __h24, __h23, __h22, __h21, __h20, __h19,
87 __h18, __h17, __h16, __h15, __h14, __h13, __h12,
88 __h11, __h10, __h9, __h8, __h7, __h6, __h5,
89 __h4, __h3, __h2, __h1};
92 #define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
93 h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \
94 h25, h26, h27, h28, h29, h30, h31, h32) \
95 _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
96 (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
97 (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \
98 (h5), (h4), (h3), (h2), (h1))
183 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
188 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
193 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
199 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1);
205 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
206 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
207 -1, -1, -1, -1, -1, -1, -1, -1, -1);
212 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
213 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
214 -1, -1, -1, -1, -1, -1, -1, -1);
233 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
251 return __builtin_shufflevector(
252 __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
253 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
272 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
273 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
277 #define _mm_comi_round_sh(A, B, P, R) \
278 __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))
280 #define _mm_comi_sh(A, B, pred) \
281 _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
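/* The comi/ucomi helpers below all funnel through __builtin_ia32_vcomish with
   the current rounding direction.  The comi* forms pass the ordered, signaling
   _CMP_*_OS predicates (and _CMP_NEQ_US), while the ucomi* forms pass the quiet
   _CMP_*_OQ / _CMP_NEQ_UQ predicates, mirroring the SSE comiss/ucomiss split. */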
285 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, _MM_FROUND_CUR_DIRECTION);
291 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, _MM_FROUND_CUR_DIRECTION);
297 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, _MM_FROUND_CUR_DIRECTION);
303 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, _MM_FROUND_CUR_DIRECTION);
309 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
315 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, _MM_FROUND_CUR_DIRECTION);
321 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, _MM_FROUND_CUR_DIRECTION);
327 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION);
333 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, _MM_FROUND_CUR_DIRECTION);
339 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, _MM_FROUND_CUR_DIRECTION);
345 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, _MM_FROUND_CUR_DIRECTION);
351 return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, _MM_FROUND_CUR_DIRECTION);
357 return (__m512h)((__v32hf)__A + (__v32hf)__B);
362 return (__m512h)__builtin_ia32_selectph_512(
368 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
373 #define _mm512_add_round_ph(A, B, R) \
374 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
375 (__v32hf)(__m512h)(B), (int)(R)))
377 #define _mm512_mask_add_round_ph(W, U, A, B, R) \
378 ((__m512h)__builtin_ia32_selectph_512( \
379 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
380 (__v32hf)(__m512h)(W)))
382 #define _mm512_maskz_add_round_ph(U, A, B, R) \
383 ((__m512h)__builtin_ia32_selectph_512( \
384 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
385 (__v32hf)_mm512_setzero_ph()))
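/* Pattern used by the masked element-wise ops in this file: compute the full
   512-bit result, then blend with __builtin_ia32_selectph_512 so lane i takes
   the new value when mask bit i is set and otherwise the passthrough (W) or
   zero.  Illustrative use (not part of this header):
     __m512h r = _mm512_maskz_add_ph(0x0000FFFF, a, b); // low 16 lanes added, rest zeroed
   The same shape repeats for sub/mul/div/min/max below. */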
389 return (__m512h)((__v32hf)__A - (__v32hf)__B);
394 return (__m512h)__builtin_ia32_selectph_512(
400 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
405 #define _mm512_sub_round_ph(A, B, R) \
406 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
407 (__v32hf)(__m512h)(B), (int)(R)))
409 #define _mm512_mask_sub_round_ph(W, U, A, B, R) \
410 ((__m512h)__builtin_ia32_selectph_512( \
411 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
412 (__v32hf)(__m512h)(W)))
414 #define _mm512_maskz_sub_round_ph(U, A, B, R) \
415 ((__m512h)__builtin_ia32_selectph_512( \
416 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
417 (__v32hf)_mm512_setzero_ph()))
421 return (__m512h)((__v32hf)__A * (__v32hf)__B);
426 return (__m512h)__builtin_ia32_selectph_512(
432 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
437 #define _mm512_mul_round_ph(A, B, R) \
438 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
439 (__v32hf)(__m512h)(B), (int)(R)))
441 #define _mm512_mask_mul_round_ph(W, U, A, B, R) \
442 ((__m512h)__builtin_ia32_selectph_512( \
443 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
444 (__v32hf)(__m512h)(W)))
446 #define _mm512_maskz_mul_round_ph(U, A, B, R) \
447 ((__m512h)__builtin_ia32_selectph_512( \
448 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
449 (__v32hf)_mm512_setzero_ph()))
453 return (__m512h)((__v32hf)__A / (__v32hf)__B);
458 return (__m512h)__builtin_ia32_selectph_512(
464 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
469 #define _mm512_div_round_ph(A, B, R) \
470 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
471 (__v32hf)(__m512h)(B), (int)(R)))
473 #define _mm512_mask_div_round_ph(W, U, A, B, R) \
474 ((__m512h)__builtin_ia32_selectph_512( \
475 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
476 (__v32hf)(__m512h)(W)))
478 #define _mm512_maskz_div_round_ph(U, A, B, R) \
479 ((__m512h)__builtin_ia32_selectph_512( \
480 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
481 (__v32hf)_mm512_setzero_ph()))
485 return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
491 return (__m512h)__builtin_ia32_selectph_512(
497 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
502 #define _mm512_min_round_ph(A, B, R) \
503 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
504 (__v32hf)(__m512h)(B), (int)(R)))
506 #define _mm512_mask_min_round_ph(W, U, A, B, R) \
507 ((__m512h)__builtin_ia32_selectph_512( \
508 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
509 (__v32hf)(__m512h)(W)))
511 #define _mm512_maskz_min_round_ph(U, A, B, R) \
512 ((__m512h)__builtin_ia32_selectph_512( \
513 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
514 (__v32hf)_mm512_setzero_ph()))
518 return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
524 return (__m512h)__builtin_ia32_selectph_512(
530 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
535 #define _mm512_max_round_ph(A, B, R) \
536 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
537 (__v32hf)(__m512h)(B), (int)(R)))
539 #define _mm512_mask_max_round_ph(W, U, A, B, R) \
540 ((__m512h)__builtin_ia32_selectph_512( \
541 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
542 (__v32hf)(__m512h)(W)))
544 #define _mm512_maskz_max_round_ph(U, A, B, R) \
545 ((__m512h)__builtin_ia32_selectph_512( \
546 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
547 (__v32hf)_mm512_setzero_ph()))
559 return (__m512h)__builtin_ia32_selectps_512(
565 return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
581 return __builtin_ia32_selectsh_128(__U, __A, __W);
591 #define _mm_add_round_sh(A, B, R) \
592 ((__m128h)__builtin_ia32_addsh_round_mask( \
593 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
594 (__mmask8)-1, (int)(R)))
596 #define _mm_mask_add_round_sh(W, U, A, B, R) \
597 ((__m128h)__builtin_ia32_addsh_round_mask( \
598 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
599 (__mmask8)(U), (int)(R)))
601 #define _mm_maskz_add_round_sh(U, A, B, R) \
602 ((__m128h)__builtin_ia32_addsh_round_mask( \
603 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
604 (__mmask8)(U), (int)(R)))
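/* The scalar *_sh operations act only on element 0 of the vectors; the
   *_round_* macro forms additionally take an explicit rounding/SAE argument.
   Illustrative use (not part of this header):
     __m128h r = _mm_add_round_sh(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */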
617 return __builtin_ia32_selectsh_128(__U, __A, __W);
627 #define _mm_sub_round_sh(A, B, R) \
628 ((__m128h)__builtin_ia32_subsh_round_mask( \
629 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
630 (__mmask8)-1, (int)(R)))
632 #define _mm_mask_sub_round_sh(W, U, A, B, R) \
633 ((__m128h)__builtin_ia32_subsh_round_mask( \
634 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
635 (__mmask8)(U), (int)(R)))
637 #define _mm_maskz_sub_round_sh(U, A, B, R) \
638 ((__m128h)__builtin_ia32_subsh_round_mask( \
639 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
640 (__mmask8)(U), (int)(R)))
653 return __builtin_ia32_selectsh_128(__U, __A, __W);
663 #define _mm_mul_round_sh(A, B, R) \
664 ((__m128h)__builtin_ia32_mulsh_round_mask( \
665 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
666 (__mmask8)-1, (int)(R)))
668 #define _mm_mask_mul_round_sh(W, U, A, B, R) \
669 ((__m128h)__builtin_ia32_mulsh_round_mask( \
670 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
671 (__mmask8)(U), (int)(R)))
673 #define _mm_maskz_mul_round_sh(U, A, B, R) \
674 ((__m128h)__builtin_ia32_mulsh_round_mask( \
675 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
676 (__mmask8)(U), (int)(R)))
689 return __builtin_ia32_selectsh_128(__U, __A, __W);
699 #define _mm_div_round_sh(A, B, R) \
700 ((__m128h)__builtin_ia32_divsh_round_mask( \
701 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
702 (__mmask8)-1, (int)(R)))
704 #define _mm_mask_div_round_sh(W, U, A, B, R) \
705 ((__m128h)__builtin_ia32_divsh_round_mask( \
706 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
707 (__mmask8)(U), (int)(R)))
709 #define _mm_maskz_div_round_sh(U, A, B, R) \
710 ((__m128h)__builtin_ia32_divsh_round_mask( \
711 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
712 (__mmask8)(U), (int)(R)))
716 return (__m128h)__builtin_ia32_minsh_round_mask(
725 return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
733 return (__m128h)__builtin_ia32_minsh_round_mask(
738 #define _mm_min_round_sh(A, B, R) \
739 ((__m128h)__builtin_ia32_minsh_round_mask( \
740 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
741 (__mmask8)-1, (int)(R)))
743 #define _mm_mask_min_round_sh(W, U, A, B, R) \
744 ((__m128h)__builtin_ia32_minsh_round_mask( \
745 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
746 (__mmask8)(U), (int)(R)))
748 #define _mm_maskz_min_round_sh(U, A, B, R) \
749 ((__m128h)__builtin_ia32_minsh_round_mask( \
750 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
751 (__mmask8)(U), (int)(R)))
755 return (__m128h)__builtin_ia32_maxsh_round_mask(
764 return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
772 return (__m128h)__builtin_ia32_maxsh_round_mask(
777 #define _mm_max_round_sh(A, B, R) \
778 ((__m128h)__builtin_ia32_maxsh_round_mask( \
779 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
780 (__mmask8)-1, (int)(R)))
782 #define _mm_mask_max_round_sh(W, U, A, B, R) \
783 ((__m128h)__builtin_ia32_maxsh_round_mask( \
784 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
785 (__mmask8)(U), (int)(R)))
787 #define _mm_maskz_max_round_sh(U, A, B, R) \
788 ((__m128h)__builtin_ia32_maxsh_round_mask( \
789 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
790 (__mmask8)(U), (int)(R)))
792 #define _mm512_cmp_round_ph_mask(A, B, P, R) \
793 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
794 (__v32hf)(__m512h)(B), (int)(P), \
795 (__mmask32)-1, (int)(R)))
797 #define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
798 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
799 (__v32hf)(__m512h)(B), (int)(P), \
800 (__mmask32)(U), (int)(R)))
802 #define _mm512_cmp_ph_mask(A, B, P) \
803 _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
805 #define _mm512_mask_cmp_ph_mask(U, A, B, P) \
806 _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
808 #define _mm_cmp_round_sh_mask(X, Y, P, R) \
809 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
810 (__v8hf)(__m128h)(Y), (int)(P), \
811 (__mmask8)-1, (int)(R)))
813 #define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
814 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
815 (__v8hf)(__m128h)(Y), (int)(P), \
816 (__mmask8)(M), (int)(R)))
818 #define _mm_cmp_sh_mask(X, Y, P) \
819 ((__mmask8)__builtin_ia32_cmpsh_mask( \
820 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
821 _MM_FROUND_CUR_DIRECTION))
823 #define _mm_mask_cmp_sh_mask(M, X, Y, P) \
824 ((__mmask8)__builtin_ia32_cmpsh_mask( \
825 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
826 _MM_FROUND_CUR_DIRECTION))
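/* The cmp_*_mask families return a bitmask (one bit per lane, so __mmask32 for
   512-bit ph vectors and a single meaningful bit for sh) selected by a _CMP_*
   predicate.  Illustrative use (not part of this header):
     __mmask32 lt = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
 */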
829 struct __mm_load_sh_struct {
832 _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
833 return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
838 __m128h src = (__v8hf)__builtin_shufflevector(
841 return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
846 return (__m128h)__builtin_ia32_loadsh128_mask(
852 return *(const __m512h *)__p;
857 return *(const __m256h *)__p;
861 return *(const __m128h *)__p;
869 return ((const struct __loadu_ph *)__p)->__v;
877 return ((const struct __loadu_ph *)__p)->__v;
884 return ((const struct __loadu_ph *)__p)->__v;
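/* The unaligned loads above go through the __loadu_ph wrapper struct rather
   than a direct pointer dereference; the struct is presumably declared packed
   and may_alias (its declaration is elided here), so the access carries no
   alignment or strict-aliasing assumptions. */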
890 struct __mm_store_sh_struct {
893 ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
899 __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
904 *(__m512h *)__P = __A;
909 *(__m256h *)__P = __A;
914 *(__m128h *)__P = __A;
922 ((struct __storeu_ph *)__P)->__v = __A;
930 ((struct __storeu_ph *)__P)->__v = __A;
938 ((struct __storeu_ph *)__P)->__v = __A;
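/* Stores mirror the loads: the aligned forms assign through a plain vector
   pointer, while the unaligned forms write through the __storeu_ph wrapper
   struct for the same alignment/aliasing reasons as the loads above. */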
952 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
958 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), _mm_setzero_ph());
964 return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
973 return (__m512h)__builtin_ia32_rcpph512_mask(
979 return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
985 return (__m512h)__builtin_ia32_rcpph512_mask(
990 return (__m512h)__builtin_ia32_rsqrtph512_mask(
996 return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
1002 return (__m512h)__builtin_ia32_rsqrtph512_mask(
1006 #define _mm512_getmant_ph(A, B, C) \
1007 ((__m512h)__builtin_ia32_getmantph512_mask( \
1008 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1009 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
1010 _MM_FROUND_CUR_DIRECTION))
1012 #define _mm512_mask_getmant_ph(W, U, A, B, C) \
1013 ((__m512h)__builtin_ia32_getmantph512_mask( \
1014 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1015 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1017 #define _mm512_maskz_getmant_ph(U, A, B, C) \
1018 ((__m512h)__builtin_ia32_getmantph512_mask( \
1019 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1020 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1022 #define _mm512_getmant_round_ph(A, B, C, R) \
1023 ((__m512h)__builtin_ia32_getmantph512_mask( \
1024 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1025 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1027 #define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
1028 ((__m512h)__builtin_ia32_getmantph512_mask( \
1029 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1030 (__mmask32)(U), (int)(R)))
1032 #define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
1033 ((__m512h)__builtin_ia32_getmantph512_mask( \
1034 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1035 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
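/* For the getmant intrinsics the two enum operands are packed into a single
   immediate as ((C) << 2) | (B): B selects the normalization interval and C the
   sign control handed to the vgetmantph instruction. */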
1038 return (__m512h)__builtin_ia32_getexpph512_mask(
1045 return (__m512h)__builtin_ia32_getexpph512_mask(
1051 return (__m512h)__builtin_ia32_getexpph512_mask(
1056 #define _mm512_getexp_round_ph(A, R) \
1057 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1058 (__v32hf)_mm512_undefined_ph(), \
1059 (__mmask32)-1, (int)(R)))
1061 #define _mm512_mask_getexp_round_ph(W, U, A, R) \
1062 ((__m512h)__builtin_ia32_getexpph512_mask( \
1063 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
1065 #define _mm512_maskz_getexp_round_ph(U, A, R) \
1066 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1067 (__v32hf)_mm512_setzero_ph(), \
1068 (__mmask32)(U), (int)(R)))
1072 return (__m512h)__builtin_ia32_scalefph512_mask(
1079 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1086 return (__m512h)__builtin_ia32_scalefph512_mask(
1091 #define _mm512_scalef_round_ph(A, B, R) \
1092 ((__m512h)__builtin_ia32_scalefph512_mask( \
1093 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1094 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1096 #define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
1097 ((__m512h)__builtin_ia32_scalefph512_mask( \
1098 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
1099 (__mmask32)(U), (int)(R)))
1101 #define _mm512_maskz_scalef_round_ph(U, A, B, R) \
1102 ((__m512h)__builtin_ia32_scalefph512_mask( \
1103 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1104 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1106 #define _mm512_roundscale_ph(A, B) \
1107 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1108 (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
1109 _MM_FROUND_CUR_DIRECTION))
1111 #define _mm512_mask_roundscale_ph(A, B, C, imm) \
1112 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1113 (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
1114 (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
1116 #define _mm512_maskz_roundscale_ph(A, B, imm) \
1117 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1118 (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1119 (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
1121 #define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
1122 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
1123 (__v32hf)(__m512h)(A), \
1124 (__mmask32)(B), (int)(R)))
1126 #define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
1127 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
1128 (__v32hf)_mm512_setzero_ph(), \
1129 (__mmask32)(A), (int)(R)))
1131 #define _mm512_roundscale_round_ph(A, imm, R) \
1132 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
1133 (__v32hf)_mm512_undefined_ph(), \
1134 (__mmask32)-1, (int)(R)))
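/* Roundscale takes an 8-bit immediate that encodes both the rounding behaviour
   and, in its upper bits, the number of fraction bits to preserve (per the
   VRNDSCALE encoding); the *_round_* variants additionally pass the usual
   rounding/SAE argument through to the builtin. */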
1136 #define _mm512_reduce_ph(A, imm) \
1137 ((__m512h)__builtin_ia32_reduceph512_mask( \
1138 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
1139 (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1141 #define _mm512_mask_reduce_ph(W, U, A, imm) \
1142 ((__m512h)__builtin_ia32_reduceph512_mask( \
1143 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
1144 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1146 #define _mm512_maskz_reduce_ph(U, A, imm) \
1147 ((__m512h)__builtin_ia32_reduceph512_mask( \
1148 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1149 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1151 #define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
1152 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1153 (__v32hf)(__m512h)(W), \
1154 (__mmask32)(U), (int)(R)))
1156 #define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
1157 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1158 (__v32hf)_mm512_setzero_ph(), \
1159 (__mmask32)(U), (int)(R)))
1161 #define _mm512_reduce_round_ph(A, imm, R) \
1162 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1163 (__v32hf)_mm512_undefined_ph(), \
1164 (__mmask32)-1, (int)(R)))
1168 return (__m128h)__builtin_ia32_rcpsh_mask(
1176 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1183 return (__m128h)__builtin_ia32_rcpsh_mask(
1189 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1197 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1203 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1207 #define _mm_getmant_round_sh(A, B, C, D, R) \
1208 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1209 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1210 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
1212 #define _mm_getmant_sh(A, B, C, D) \
1213 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1214 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1215 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
1217 #define _mm_mask_getmant_sh(W, U, A, B, C, D) \
1218 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1219 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1220 (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1222 #define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
1223 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1224 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1225 (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
1227 #define _mm_maskz_getmant_sh(U, A, B, C, D) \
1228 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1229 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1230 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1232 #define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
1233 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1234 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1235 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1237 #define _mm_getexp_round_sh(A, B, R) \
1238 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1239 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1240 (__mmask8)-1, (int)(R)))
1244 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1251 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1252 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1256 #define _mm_mask_getexp_round_sh(W, U, A, B, R) \
1257 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1258 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1259 (__mmask8)(U), (int)(R)))
1263 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1268 #define _mm_maskz_getexp_round_sh(U, A, B, R) \
1269 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1270 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1271 (__mmask8)(U), (int)(R)))
1273 #define _mm_scalef_round_sh(A, B, R) \
1274 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1275 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1276 (__mmask8)-1, (int)(R)))
1280 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1287 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1292 #define _mm_mask_scalef_round_sh(W, U, A, B, R) \
1293 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1294 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1295 (__mmask8)(U), (int)(R)))
1299 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1304 #define _mm_maskz_scalef_round_sh(U, A, B, R) \
1305 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1306 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1307 (__mmask8)(U), (int)(R)))
1309 #define _mm_roundscale_round_sh(A, B, imm, R) \
1310 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1311 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1312 (__mmask8)-1, (int)(imm), (int)(R)))
1314 #define _mm_roundscale_sh(A, B, imm) \
1315 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1316 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1317 (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
1319 #define _mm_mask_roundscale_sh(W, U, A, B, I) \
1320 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1321 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1322 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1324 #define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
1325 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1326 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1327 (__mmask8)(U), (int)(I), (int)(R)))
1329 #define _mm_maskz_roundscale_sh(U, A, B, I) \
1330 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1331 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1332 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1334 #define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
1335 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1336 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1337 (__mmask8)(U), (int)(I), (int)(R)))
1339 #define _mm_reduce_sh(A, B, C) \
1340 ((__m128h)__builtin_ia32_reducesh_mask( \
1341 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1342 (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
1344 #define _mm_mask_reduce_sh(W, U, A, B, C) \
1345 ((__m128h)__builtin_ia32_reducesh_mask( \
1346 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1347 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1349 #define _mm_maskz_reduce_sh(U, A, B, C) \
1350 ((__m128h)__builtin_ia32_reducesh_mask( \
1351 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1352 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1354 #define _mm_reduce_round_sh(A, B, C, R) \
1355 ((__m128h)__builtin_ia32_reducesh_mask( \
1356 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1357 (__mmask8)-1, (int)(C), (int)(R)))
1359 #define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
1360 ((__m128h)__builtin_ia32_reducesh_mask( \
1361 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1362 (__mmask8)(U), (int)(C), (int)(R)))
1364 #define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
1365 ((__m128h)__builtin_ia32_reducesh_mask( \
1366 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1367 (__mmask8)(U), (int)(C), (int)(R)))
1369 #define _mm512_sqrt_round_ph(A, R) \
1370 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
1372 #define _mm512_mask_sqrt_round_ph(W, U, A, R) \
1373 ((__m512h)__builtin_ia32_selectph_512( \
1374 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1375 (__v32hf)(__m512h)(W)))
1377 #define _mm512_maskz_sqrt_round_ph(U, A, R) \
1378 ((__m512h)__builtin_ia32_selectph_512( \
1379 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1380 (__v32hf)_mm512_setzero_ph()))
1383 return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1389 return (__m512h)__builtin_ia32_selectph_512(
1392 (__v32hf)(__m512h)(__W));
1397 return (__m512h)__builtin_ia32_selectph_512(
1403 #define _mm_sqrt_round_sh(A, B, R) \
1404 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1405 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1406 (__mmask8)-1, (int)(R)))
1408 #define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
1409 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1410 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1411 (__mmask8)(U), (int)(R)))
1413 #define _mm_maskz_sqrt_round_sh(U, A, B, R) \
1414 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1415 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1416 (__mmask8)(U), (int)(R)))
1420 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1421 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1429 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1430 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1437 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1438 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1442 #define _mm512_mask_fpclass_ph_mask(U, A, imm) \
1443 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1444 (int)(imm), (__mmask32)(U)))
1446 #define _mm512_fpclass_ph_mask(A, imm) \
1447 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1448 (int)(imm), (__mmask32)-1))
1450 #define _mm_fpclass_sh_mask(A, imm) \
1451 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), (__mmask8)-1))
1454 #define _mm_mask_fpclass_sh_mask(U, A, imm) \
1455 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), (__mmask8)(U)))
1458 #define _mm512_cvt_roundpd_ph(A, R) \
1459 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1460 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1462 #define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1463 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1464 (__mmask8)(U), (int)(R)))
1466 #define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1467 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1468 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1471 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1478 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1484 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1489 #define _mm512_cvt_roundph_pd(A, R) \
1490 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1491 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1493 #define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1494 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1495 (__mmask8)(U), (int)(R)))
1497 #define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1498 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1499 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1502 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1509 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1515 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1520 #define _mm_cvt_roundsh_ss(A, B, R) \
1521 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1522 (__v4sf)_mm_undefined_ps(), \
1523 (__mmask8)(-1), (int)(R)))
1525 #define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1526 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1527 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1529 #define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1530 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1531 (__v4sf)_mm_setzero_ps(), \
1532 (__mmask8)(U), (int)(R)))
1536 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1545 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1553 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1558 #define _mm_cvt_roundss_sh(A, B, R) \
1559 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1560 (__v8hf)_mm_undefined_ph(), \
1561 (__mmask8)(-1), (int)(R)))
1563 #define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1564 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1565 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1567 #define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1568 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1569 (__v8hf)_mm_setzero_ph(), \
1570 (__mmask8)(U), (int)(R)))
1574 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1583 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1584 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1591 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1596 #define _mm_cvt_roundsd_sh(A, B, R) \
1597 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1598 (__v8hf)_mm_undefined_ph(), \
1599 (__mmask8)(-1), (int)(R)))
1601 #define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1602 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1603 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1605 #define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1606 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1607 (__v8hf)_mm_setzero_ph(), \
1608 (__mmask8)(U), (int)(R)))
1612 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1621 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1622 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1628 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1633 #define _mm_cvt_roundsh_sd(A, B, R) \
1634 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1635 (__v2df)_mm_undefined_pd(), \
1636 (__mmask8)(-1), (int)(R)))
1638 #define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1639 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1640 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1642 #define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1643 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1644 (__v2df)_mm_setzero_pd(), \
1645 (__mmask8)(U), (int)(R)))
1649 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1658 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1659 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1665 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1670 #define _mm512_cvt_roundph_epi16(A, R) \
1671 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1672 (__v32hi)_mm512_undefined_epi32(), \
1673 (__mmask32)(-1), (int)(R)))
1675 #define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1676 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1677 (__mmask32)(U), (int)(R)))
1679 #define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1680 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1681 (__v32hi)_mm512_setzero_epi32(), \
1682 (__mmask32)(U), (int)(R)))
1686 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1693 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1699 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1704 #define _mm512_cvtt_roundph_epi16(A, R) \
1705 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1706 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), (int)(R)))
1709 #define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1710 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1711 (__mmask32)(U), (int)(R)))
1713 #define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1714 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1715 (__v32hi)_mm512_setzero_epi32(), \
1716 (__mmask32)(U), (int)(R)))
1720 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1727 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1733 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1738 #define _mm512_cvt_roundepi16_ph(A, R) \
1739 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1740 (__v32hf)_mm512_undefined_ph(), \
1741 (__mmask32)(-1), (int)(R)))
1743 #define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1744 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1745 (__mmask32)(U), (int)(R)))
1747 #define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1748 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1749 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1753 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1760 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1766 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1771 #define _mm512_cvt_roundph_epu16(A, R) \
1772 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1773 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), (int)(R)))
1776 #define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1777 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1778 (__mmask32)(U), (int)(R)))
1780 #define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1781 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1782 (__v32hu)_mm512_setzero_epi32(), \
1783 (__mmask32)(U), (int)(R)))
1787 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1794 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1800 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1805 #define _mm512_cvtt_roundph_epu16(A, R) \
1806 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1807 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), (int)(R)))
1810 #define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1811 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1812 (__mmask32)(U), (int)(R)))
1814 #define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1815 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1816 (__v32hu)_mm512_setzero_epi32(), \
1817 (__mmask32)(U), (int)(R)))
1821 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1828 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1834 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1839 #define _mm512_cvt_roundepu16_ph(A, R) \
1840 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1841 (__v32hf)_mm512_undefined_ph(), \
1842 (__mmask32)(-1), (int)(R)))
1844 #define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1845 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1846 (__mmask32)(U), (int)(R)))
1848 #define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1849 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1850 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1854 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1861 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1867 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1872 #define _mm512_cvt_roundph_epi32(A, R) \
1873 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1874 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), (int)(R)))
1877 #define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1878 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1879 (__mmask16)(U), (int)(R)))
1881 #define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1882 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1883 (__v16si)_mm512_setzero_epi32(), \
1884 (__mmask16)(U), (int)(R)))
1888 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1895 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1901 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1906 #define _mm512_cvt_roundph_epu32(A, R) \
1907 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1908 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), (int)(R)))
1911 #define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1912 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1913 (__mmask16)(U), (int)(R)))
1915 #define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1916 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1917 (__v16su)_mm512_setzero_epi32(), \
1918 (__mmask16)(U), (int)(R)))
1922 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1929 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1935 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1940 #define _mm512_cvt_roundepi32_ph(A, R) \
1941 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1942 (__v16hf)_mm256_undefined_ph(), \
1943 (__mmask16)(-1), (int)(R)))
1945 #define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1946 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1947 (__mmask16)(U), (int)(R)))
1949 #define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1950 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1951 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1955 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1962 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1968 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1973 #define _mm512_cvt_roundepu32_ph(A, R) \
1974 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1975 (__v16hf)_mm256_undefined_ph(), \
1976 (__mmask16)(-1), (int)(R)))
1978 #define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1979 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1980 (__mmask16)(U), (int)(R)))
1982 #define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1983 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1984 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1988 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1995 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2001 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2006 #define _mm512_cvtt_roundph_epi32(A, R) \
2007 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2008 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), (int)(R)))
2011 #define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2012 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2013 (__mmask16)(U), (int)(R)))
2015 #define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2016 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2017 (__v16si)_mm512_setzero_epi32(), \
2018 (__mmask16)(U), (int)(R)))
2022 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2029 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2035 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2040 #define _mm512_cvtt_roundph_epu32(A, R) \
2041 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2042 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), (int)(R)))
2045 #define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2046 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2047 (__mmask16)(U), (int)(R)))
2049 #define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2050 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2051 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), (int)(R)))
2056 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2063 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2069 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2074 #define _mm512_cvt_roundepi64_ph(A, R) \
2075 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2076 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2078 #define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2079 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2080 (__mmask8)(U), (int)(R)))
2082 #define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2083 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2084 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2088 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2095 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2101 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2106 #define _mm512_cvt_roundph_epi64(A, R) \
2107 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2108 (__v8di)_mm512_undefined_epi32(), \
2109 (__mmask8)(-1), (int)(R)))
2111 #define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2112 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2113 (__mmask8)(U), (int)(R)))
2115 #define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2116 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2117 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2121 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2128 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2134 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2139 #define _mm512_cvt_roundepu64_ph(A, R) \
2140 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2141 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2143 #define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2144 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2145 (__mmask8)(U), (int)(R)))
2147 #define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2148 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2149 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2153 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2160 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2166 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2171 #define _mm512_cvt_roundph_epu64(A, R) \
2172 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2173 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), (int)(R)))
2176 #define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2177 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2178 (__mmask8)(U), (int)(R)))
2180 #define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2181 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2182 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2186 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2193 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2199 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2204 #define _mm512_cvtt_roundph_epi64(A, R) \
2205 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2206 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), (int)(R)))
2209 #define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2210 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2211 (__mmask8)(U), (int)(R)))
2213 #define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2214 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2215 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2219 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2226 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2232 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2237 #define _mm512_cvtt_roundph_epu64(A, R) \
2238 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2239 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), (int)(R)))
2242 #define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2243 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2244 (__mmask8)(U), (int)(R)))
2246 #define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2247 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2248 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2252 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2259 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2265 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2270 #define _mm_cvt_roundsh_i32(A, R) \
2271 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2277 #define _mm_cvt_roundsh_u32(A, R) \
2278 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2282 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2287 #define _mm_cvt_roundsh_i64(A, R) \
2288 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2291 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2295 #define _mm_cvt_roundsh_u64(A, R) \
2296 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2299 _mm_cvtsh_u64(__m128h __A) {
2300 return (unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2303 #endif // __x86_64__
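/* The 64-bit integer <-> _Float16 scalar conversions above are only provided
   when targeting x86_64, hence the __x86_64__ guard. */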
2305 #define _mm_cvt_roundu32_sh(A, B, R) \
2306 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2315 #define _mm_cvt_roundu64_sh(A, B, R) \
2316 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2320 _mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2326 #define _mm_cvt_roundi32_sh(A, B, R) \
2327 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2336 #define _mm_cvt_roundi64_sh(A, B, R) \
2337 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2346 #define _mm_cvtt_roundsh_i32(A, R) \
2347 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2350 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2355 #define _mm_cvtt_roundsh_i64(A, R) \
2356 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2359 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2364 #define _mm_cvtt_roundsh_u32(A, R) \
2365 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2369 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2374 #define _mm_cvtt_roundsh_u64(A, R) \
2375 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2378 _mm_cvttsh_u64(__m128h __A) {
2379 return (unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2384 #define _mm512_cvtx_roundph_ps(A, R) \
2385 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2386 (__v16sf)_mm512_undefined_ps(), \
2387 (__mmask16)(-1), (int)(R)))
2389 #define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2390 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2391 (__mmask16)(U), (int)(R)))
2393 #define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2394 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2395 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2398 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2405 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2411 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2416 #define _mm512_cvtx_roundps_ph(A, R) \
2417 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2418 (__v16hf)_mm256_undefined_ph(), \
2419 (__mmask16)(-1), (int)(R)))
2421 #define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2422 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2423 (__mmask16)(U), (int)(R)))
2425 #define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2426 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2427 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2430 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2437 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2443 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2448 #define _mm512_fmadd_round_ph(A, B, C, R) \
2449 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2450 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2451 (__mmask32)-1, (int)(R)))
2453 #define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2454 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2455 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2456 (__mmask32)(U), (int)(R)))
2458 #define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2459 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2460 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2461 (__mmask32)(U), (int)(R)))
2463 #define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2464 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2465 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2466 (__mmask32)(U), (int)(R)))
2468 #define _mm512_fmsub_round_ph(A, B, C, R) \
2469 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2470 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2471 (__mmask32)-1, (int)(R)))
2473 #define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2474 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2475 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2476 (__mmask32)(U), (int)(R)))
2478 #define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2479 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2480 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2481 (__mmask32)(U), (int)(R)))
2483 #define _mm512_fnmadd_round_ph(A, B, C, R) \
2484 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2485 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2486 (__mmask32)-1, (int)(R)))
2488 #define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2489 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2490 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2491 (__mmask32)(U), (int)(R)))
2493 #define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2494 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2495 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2496 (__mmask32)(U), (int)(R)))
2498 #define _mm512_fnmsub_round_ph(A, B, C, R) \
2499 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2500 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2501 (__mmask32)-1, (int)(R)))
2503 #define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2504 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2505 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2506 (__mmask32)(U), (int)(R)))
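/* The fmsub/fnmadd/fnmsub round macros above are expressed by negating the
   relevant operands of the single vfmaddph512 builtin rather than through
   separate builtins; only the mask3 forms further below need the dedicated
   vfmsub* builtins, so that the operand preserved by the mask keeps its sign. */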
2511 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2518 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2525 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2532 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2540 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2547 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2554 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2555 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2562 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2569 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2576 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2584 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2591 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2592 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2596 #define _mm512_fmaddsub_round_ph(A, B, C, R) \
2597 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2598 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2599 (__mmask32)-1, (int)(R)))
2601 #define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2602 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2603 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2604 (__mmask32)(U), (int)(R)))
2606 #define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2607 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2608 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2609 (__mmask32)(U), (int)(R)))
2611 #define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2612 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2613 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2614 (__mmask32)(U), (int)(R)))
2616 #define _mm512_fmsubadd_round_ph(A, B, C, R) \
2617 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2618 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2619 (__mmask32)-1, (int)(R)))
2621 #define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2622 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2623 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2624 (__mmask32)(U), (int)(R)))
2626 #define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2627 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2628 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2629 (__mmask32)(U), (int)(R)))
2633 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2634 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2640 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2641 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2647 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2648 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2654 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2655 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2661 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2662 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2668 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2669 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2675 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2676 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2680 #define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2681 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2682 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2683 (__mmask32)(U), (int)(R)))
2687 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2692 #define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2693 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2694 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2695 (__mmask32)(U), (int)(R)))
2699 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2700 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2704 #define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2705 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2706 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2707 (__mmask32)(U), (int)(R)))
2711 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2716 #define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2717 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2718 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2719 (__mmask32)(U), (int)(R)))
2721 #define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2722 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2723 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2724 (__mmask32)(U), (int)(R)))
2728 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2735 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
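/* Scalar FP16 FMA family (_sh suffix): only the lowest FP16 element is
 * computed; the remaining elements are passed through unchanged from one of
 * the sources. The fnm* forms negate the product and the *sub forms negate
 * the addend, which is why they all funnel into the vfmaddsh3/vfmsubsh3
 * builtins with negated arguments. */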
return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,

return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,

#define _mm_fmadd_round_sh(A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,

#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,

#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
(__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
(__mmask8)(U), (int)(R)))

return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,

return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,

#define _mm_fmsub_round_sh(A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
(__mmask8)(U), (int)(R)))

return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))
return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,

#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
(__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,

return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,

#define _mm_fnmadd_round_sh(A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,

#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,

#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
(__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,

return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,

#define _mm_fnmsub_round_sh(A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
((__m128h)__builtin_ia32_vfmaddsh3_mask( \
(__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,

#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,

#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
(__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
(__mmask8)(U), (int)(R)))
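/* Scalar complex FP16 (_sch suffix): a complex value is an FP16 (real,
 * imaginary) pair, so the builtins view the registers as __v4sf. The fc*
 * variants multiply by the complex conjugate of the second operand; masking
 * is applied per complex element. */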
return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,

return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(

return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,

return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(

#define _mm_fcmadd_round_sch(A, B, C, R) \
((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,

return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(

return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,

return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(

#define _mm_fmadd_round_sch(A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

return (__m128h)__builtin_ia32_vfcmulcsh_mask(

return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,

return (__m128h)__builtin_ia32_vfcmulcsh_mask(

#define _mm_fcmul_round_sch(A, B, R) \
((__m128h)__builtin_ia32_vfcmulcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
((__m128h)__builtin_ia32_vfcmulcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
(__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
((__m128h)__builtin_ia32_vfcmulcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

return (__m128h)__builtin_ia32_vfmulcsh_mask(

return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,

return (__m128h)__builtin_ia32_vfmulcsh_mask(

#define _mm_fmul_round_sch(A, B, R) \
((__m128h)__builtin_ia32_vfmulcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
((__m128h)__builtin_ia32_vfmulcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
(__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R) \
((__m128h)__builtin_ia32_vfmulcsh_mask( \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
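/* Packed complex FP16 (_pch suffix): the same complex multiply and
 * multiply-accumulate operations applied to all 16 complex (FP16 pair)
 * elements of a 512-bit register, with an __mmask16 write mask (one bit per
 * complex element) and explicit rounding via the _round_ macros. */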
return (__m512h)__builtin_ia32_vfcmulcph512_mask(

return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,

return (__m512h)__builtin_ia32_vfcmulcph512_mask(

#define _mm512_fcmul_round_pch(A, B, R) \
((__m512h)__builtin_ia32_vfcmulcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
((__m512h)__builtin_ia32_vfcmulcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
(__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
((__m512h)__builtin_ia32_vfcmulcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))

return (__m512h)__builtin_ia32_vfmulcph512_mask(

return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,

return (__m512h)__builtin_ia32_vfmulcph512_mask(

#define _mm512_fmul_round_pch(A, B, R) \
((__m512h)__builtin_ia32_vfmulcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
((__m512h)__builtin_ia32_vfmulcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
(__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
((__m512h)__builtin_ia32_vfmulcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))

return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,

return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,

return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,

return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,

#define _mm512_fcmadd_round_pch(A, B, C, R) \
((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,

return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,

return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,

return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,

#define _mm512_fmadd_round_pch(A, B, C, R) \
((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
((__m512h)__builtin_ia32_vfmaddcph512_mask( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))
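/* Horizontal reductions over the 32 FP16 elements of a 512-bit vector, plus
 * lane-crossing permutes. The add reduction seeds with -0.0 and the multiply
 * reduction with 1.0 so that the identity element does not perturb the
 * result. */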
return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);

return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);

return __builtin_ia32_reduce_fmax_ph512(__V);

return __builtin_ia32_reduce_fmin_ph512(__V);

return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,

return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,

return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
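/* Convenience aliases: the mul/cmul spellings below map directly onto the
 * fmul/fcmul intrinsics defined above. */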
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
_mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R) \
_mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
_mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
_mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R) \
_mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
_mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R) \
_mm_maskz_fcmul_round_sch(U, A, B, R)
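/* Example (illustrative): the _round_ forms take an explicit _MM_FROUND_*
 * control, e.g.
 *   __m512h r = _mm512_fmadd_round_pch(
 *       a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 * whereas the plain forms use _MM_FROUND_CUR_DIRECTION implicitly. */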
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS512