11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
14 #ifndef __AVX512VLFP16INTRIN_H
15 #define __AVX512VLFP16INTRIN_H
18 #define __DEFAULT_FN_ATTRS256 \
19 __attribute__((__always_inline__, __nodebug__, \
20 __target__("avx512fp16, avx512vl"), \
21 __min_vector_width__(256)))
22 #define __DEFAULT_FN_ATTRS128 \
23 __attribute__((__always_inline__, __nodebug__, \
24 __target__("avx512fp16, avx512vl"), \
25 __min_vector_width__(128)))
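
/* Note on conventions (summary, not normative): most operations below come
 * in three forms following the usual AVX-512 masking pattern. The plain form
 * computes every lane; _mask_ forms merge the result into __W wherever the
 * writemask __U has a 1 bit, keeping __W elsewhere; _maskz_ forms zero the
 * masked-off lanes instead. The __builtin_ia32_selectph_128/256 builtins
 * implement that per-lane select. */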
  return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};

  return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};

  return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};

  return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};

  return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));

  return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
                            __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};

#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
  _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))

#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16)                                          \
  _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8),  \
                (h7), (h6), (h5), (h4), (h3), (h2), (h1))
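
/* Usage sketch (illustrative only; assumes a target with avx512fp16 and
 * avx512vl enabled):
 *   __m128h ones = _mm_set1_ph((_Float16)1.0);
 *   __m128h ramp = _mm_setr_ph((_Float16)0, (_Float16)1, (_Float16)2,
 *                              (_Float16)3, (_Float16)4, (_Float16)5,
 *                              (_Float16)6, (_Float16)7);
 * _mm_set_ph lists elements from the highest lane down; _mm_setr_ph takes
 * them in memory order, which is why the macro above reverses its args. */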
  return (__m256h)((__v16hf)__A + (__v16hf)__B);

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m128h)((__v8hf)__A + (__v8hf)__B);

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),

  return (__m256h)((__v16hf)__A - (__v16hf)__B);

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m128h)((__v8hf)__A - (__v8hf)__B);

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),

  return (__m256h)((__v16hf)__A * (__v16hf)__B);

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m128h)((__v8hf)__A * (__v8hf)__B);

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),

  return (__m256h)((__v16hf)__A / (__v16hf)__B);

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m128h)((__v8hf)__A / (__v8hf)__B);

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),

  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
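
/* The plain forms lower to ordinary vector +, -, *, / on __v8hf/__v16hf, so
 * they follow standard IEEE semantics for _Float16. Masked arithmetic
 * sketch (illustrative): keep a+b only in the even lanes, preserving dst
 * elsewhere:
 *   __m128h dst2 = _mm_mask_add_ph(dst, (__mmask8)0x55, a, b);
 */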
  return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),

  return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),

  return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),

  return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
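
/* Behavioral note (summary, hedged): VMINPH/VMAXPH are generally described
 * as following the legacy MINPS/MAXPS convention rather than IEEE
 * minimum/maximum: when the operands are both zeros or either is NaN, the
 * second source operand is returned. Consult the Intel SDM if NaN
 * propagation matters for your use. */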
  return (__m256h)__builtin_ia32_selectps_256(

  return (__m256h)__builtin_ia32_selectps_256(

  return (__m128h)__builtin_ia32_selectps_128(

  return (__m128h)__builtin_ia32_selectps_128(

#define _mm256_cmp_ph_mask(a, b, p)                                            \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))

#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))

#define _mm_cmp_ph_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))

#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
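
/* The comparison predicate p is one of the 32 _CMP_* constants from
 * <immintrin.h>. Illustrative: build a mask of lanes where a < b (ordered,
 * signaling), then use it in a masked multiply:
 *   __mmask8 lt = _mm_cmp_ph_mask(a, b, _CMP_LT_OS);
 *   __m128h  m  = _mm_maskz_mul_ph(lt, a, b);
 */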
  return (__m256h)__builtin_ia32_rcpph256_mask(

  return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,

  return (__m256h)__builtin_ia32_rcpph256_mask(

  return (__m128h)__builtin_ia32_rcpph128_mask(

  return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_rcpph128_mask(

  return (__m256h)__builtin_ia32_rsqrtph256_mask(

  return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,

  return (__m256h)__builtin_ia32_rsqrtph256_mask(

  return (__m128h)__builtin_ia32_rsqrtph128_mask(

  return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_rsqrtph128_mask(
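
/* rcp/rsqrt are hardware approximations of 1/x and 1/sqrt(x): much faster
 * than a true divide or square root, but not correctly rounded (see the
 * Intel SDM for the exact error bound of VRCPPH/VRSQRTPH). Prefer
 * _mm_div_ph / _mm_sqrt_ph when full precision matters. */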
  return (__m128h)__builtin_ia32_getexpph128_mask(

  return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_getexpph128_mask(

  return (__m256h)__builtin_ia32_getexpph256_mask(

  return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,

  return (__m256h)__builtin_ia32_getexpph256_mask(

#define _mm_getmant_ph(A, B, C)                                                \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1))

#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W),     \
      (__mmask8)(U)))

#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U)))

#define _mm256_getmant_ph(A, B, C)                                             \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))

#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
      (__mmask16)(U)))

#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
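
/* getmant extracts the normalized mantissa: B selects the normalization
 * interval (e.g. _MM_MANT_NORM_1_2 for [1,2)) and C the sign control
 * (e.g. _MM_MANT_SIGN_src); they are packed into a single immediate as
 * ((C) << 2) | (B), as visible above. Illustrative:
 *   __m128h m = _mm_getmant_ph(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 */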
  return (__m128h)__builtin_ia32_scalefph128_mask(

  return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,

  return (__m128h)__builtin_ia32_scalefph128_mask(

  return (__m256h)__builtin_ia32_scalefph256_mask(

  return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,

  return (__m256h)__builtin_ia32_scalefph256_mask(

#define _mm_roundscale_ph(A, imm)                                              \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)-1))

#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)(U)))

#define _mm256_roundscale_ph(A, imm)                                           \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)-1))

#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
      (__mmask16)(U)))

#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)(U)))

#define _mm_reduce_ph(A, imm)                                                  \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)-1))

#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_reduce_ph(U, A, imm)                                         \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)(U)))

#define _mm256_reduce_ph(A, imm)                                               \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)-1))

#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)(__m256h)(W),             \
                                            (__mmask16)(U)))

#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)(U)))
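
/* In the roundscale immediate, the low four bits select rounding behavior
 * (rounding mode and exception suppression, as with the _MM_FROUND_* flags)
 * and the high four bits give M: the result is rounded to multiples of
 * 2^-M. Illustrative round-to-nearest-integer:
 *   __m128h r = _mm_roundscale_ph(x, _MM_FROUND_TO_NEAREST_INT |
 *                                    _MM_FROUND_NO_EXC);
 * _mm_reduce_ph (VREDUCEPH) returns the residual x - roundscale(x), useful
 * for argument reduction; it is unrelated to the horizontal
 * _mm_reduce_add_ph-style reductions that appear near the end of this
 * header. */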
  return __builtin_ia32_sqrtph((__v8hf)__a);

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)(U)))

#define _mm_fpclass_ph_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)-1))

#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)(U)))

#define _mm256_fpclass_ph_mask(A, imm)                                         \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)-1))
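
/* The fpclass immediate is a bitmask of categories to test (QNaN, +0, -0,
 * +Inf, -Inf, denormal, negative finite, SNaN; see the Intel SDM for the
 * exact bit assignments). Illustrative: flag infinite lanes, assuming the
 * conventional 0x18 = positive-infinity | negative-infinity encoding:
 *   __mmask8 inf = _mm_fpclass_ph_mask(x, 0x18);
 */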
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(

  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(

  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(

  return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,

  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(

  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(

  return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,

  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(

  return (__m128i)__builtin_ia32_vcvtph2w128_mask(

  return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,

  return (__m128i)__builtin_ia32_vcvtph2w128_mask(

  return (__m256i)__builtin_ia32_vcvtph2w256_mask(

  return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,

  return (__m256i)__builtin_ia32_vcvtph2w256_mask(

  return (__m128i)__builtin_ia32_vcvttph2w128_mask(

  return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,

  return (__m128i)__builtin_ia32_vcvttph2w128_mask(

  return (__m256i)__builtin_ia32_vcvttph2w256_mask(

  return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,

  return (__m256i)__builtin_ia32_vcvttph2w256_mask(

  return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,

  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(

  return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,

  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(

  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(

  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,

  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(

  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(

  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,

  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(

  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(

  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,

  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(

  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);

  return (__m256h)__builtin_ia32_selectph_256(

  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,

  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(

  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,

  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(

  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(

  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,

  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(

  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(

  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,

  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(

  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(

  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,

  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(

  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(

  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(

  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(

  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,

  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(

  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(

  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,

  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(

  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(

  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,

  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(

  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(

  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,

  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(

  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(

  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(

  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(

  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,

  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(

  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(

  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,

  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(

  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(

  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(

  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(

  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(

  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,

  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(

  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(

  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,

  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(

  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(

  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,

  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(

  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(

  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,

  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(

  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(

  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,

  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(

  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(

  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,

  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(

  return (__m128)__builtin_ia32_vcvtph2psx128_mask(

  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,

  return (__m128)__builtin_ia32_vcvtph2psx128_mask(

  return (__m256)__builtin_ia32_vcvtph2psx256_mask(

  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,

  return (__m256)__builtin_ia32_vcvtph2psx256_mask(

  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(

  return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(

  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(

  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,

  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
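
/* Conversion naming (summary): the vcvtph2{w,dq,qq,...} builtins round
 * according to the current MXCSR rounding mode, while the vcvttph2* forms
 * truncate toward zero; these back the _mm*_cvtph_* and _mm*_cvttph_*
 * intrinsics respectively. Several same-width conversions lower to plain
 * __builtin_convertvector, as seen above. Illustrative round trip:
 *   __m128i w = _mm_cvttph_epi16(h);  // FP16 -> int16, truncating
 *   __m128h g = _mm_cvtepi16_ph(w);   // int16 -> FP16
 */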
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_selectph_128(

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),

  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),

  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),

  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),

  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),

  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),

  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),

  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),

  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),

  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),

  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),

  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
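
/* All the fma/fms/fnma/fnms variants funnel into __builtin_ia32_vfmaddph
 * (128-bit) or __builtin_ia32_vfmaddph256 with sign flips applied to the
 * inputs, as visible above: fmsub negates C, fnmadd negates A, fnmsub
 * negates both, and the fmaddsub/fmsubadd forms alternate add/subtract per
 * lane. Illustrative:
 *   __m128h r = _mm_fnmadd_ph(a, b, c);  // c - a*b with a single rounding
 */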
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(

  return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,

  return (__m128h)__builtin_ia32_vfcmulcph128_mask(

  return (__m256h)__builtin_ia32_vfcmulcph256_mask(

  return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,

  return (__m256h)__builtin_ia32_vfcmulcph256_mask(

  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,

  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,

  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,

  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);

  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,

  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,

  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,

  return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);

  return (__m128h)__builtin_ia32_vfmulcph128_mask(

  return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,

  return (__m128h)__builtin_ia32_vfmulcph128_mask(

  return (__m256h)__builtin_ia32_vfmulcph256_mask(

  return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,

  return (__m256h)__builtin_ia32_vfmulcph256_mask(

  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,

  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,

  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,

  return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,

  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,

  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,

  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,

  return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
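
/* The _pch family treats each pair of adjacent FP16 lanes as one complex
 * number, which is why the builtins cast through float vectors (__v4sf /
 * __v8sf): one 32-bit lane carries one half-precision complex value. The
 * fcmul/fcmadd forms multiply by the complex conjugate of the second
 * operand. Illustrative (4 complex lanes in a __m128h):
 *   __m128h p = _mm_fcmul_pch(a, b);  // a * conj(b)
 */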
  return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,

  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,

  return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,

  return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,

  return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);

  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);

  return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);

  return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);

  return __builtin_ia32_reduce_fmax_ph256(__V);

  return __builtin_ia32_reduce_fmin_ph256(__V);

  return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);

  return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);

  return __builtin_ia32_reduce_fmax_ph128(__V);

  return __builtin_ia32_reduce_fmin_ph128(__V);
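
/* The horizontal reductions lower to reassociating reduction builtins: fadd
 * seeds with -0.0f16 (the additive identity that preserves signed zero) and
 * fmul with 1.0f16, so the combination order is not the strict
 * left-to-right IEEE order. Illustrative:
 *   _Float16 total = _mm256_reduce_add_ph(v);
 *   _Float16 peak  = _mm256_reduce_max_ph(v);
 */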
#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)

#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif