#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(128)))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif
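/* The __DEFAULT_FN_ATTRS* macros force inlining, suppress debug info, and
   require the avx512fp16 target feature at the given minimum vector width;
   the *_CONSTEXPR variants additionally mark an intrinsic constexpr when
   compiling as C++11 or later. */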
static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};

  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9, __h8, __h7, __h6, __h5,
                            __h4, __h3, __h2, __h1};

  return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
                       e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
                       e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
_mm512_set1_pch(_Float16 _Complex __h) {

_mm256_castph_si256(__m256h __a) {

_mm512_castph_si512(__m512h __a) {

_mm256_castsi256_ph(__m256i __a) {

_mm512_castsi512_ph(__m512i __a) {
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                 11, 12, 13, 14, 15);
}
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
}

_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
                                 25, 26, 27, 28, 29, 30, 31);
}
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
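/* The comi/ucomi helpers below compare only the lowest half-precision element
   of each operand and return the predicate result as an int; the _OS/_US
   predicates signal on quiet NaN inputs while the _OQ/_UQ predicates stay
   quiet. */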
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);

  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
  return (__m512h)((__v32hf)__A + (__v32hf)__B);

_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
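/* Convention for the masked forms throughout this header: the _mask_ variants
   take a writemask U and a pass-through vector W, keeping W's element wherever
   the corresponding mask bit is 0; the _maskz_ variants zero those elements
   instead. The *_round_* macros additionally take a rounding-mode immediate R
   such as _MM_FROUND_CUR_DIRECTION. */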
  return (__m512h)((__v32hf)__A - (__v32hf)__B);

_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
  return (__m512h)((__v32hf)__A * (__v32hf)__B);

_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
  return (__m512h)((__v32hf)__A / (__v32hf)__B);

_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);

_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);

_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
_mm512_abs_ph(__m512h __A) {

_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}
_mm_add_sh(__m128h __A, __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
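/* Scalar _sh operations act on element 0 only; the remaining seven half
   elements of the result are carried over from the first source operand, and
   the single mask bit selects between the computed value and W (or zero). */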
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_sub_sh(__m128h __A, __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mul_sh(__m128h __A, __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_div_sh(__m128h __A, __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);

#define _mm_min_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);

  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);

#define _mm_max_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
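/* The cmp mask macros take one of the _CMP_* predicates from <immintrin.h>
   and return a bitmask with one bit per compared element (a single bit for
   the scalar _sh forms) rather than a vector of all-ones/all-zeros lanes. */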
  struct __mm_load_sh_struct {

  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};

_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src,
                                                __U & 1);
}

_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

  return *(const __m128h *)__p;

_mm512_loadu_ph(void const *__p) {

  return ((const struct __loadu_ph *)__p)->__v;
}

_mm256_loadu_ph(void const *__p) {

  return ((const struct __loadu_ph *)__p)->__v;
}

  return ((const struct __loadu_ph *)__p)->__v;
  struct __mm_store_sh_struct {

  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];

  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);

  *(__m512h *)__P = __A;

  *(__m256h *)__P = __A;

  *(__m128h *)__P = __A;

  ((struct __storeu_ph *)__P)->__v = __A;

  ((struct __storeu_ph *)__P)->__v = __A;

  ((struct __storeu_ph *)__P)->__v = __A;
_mm_move_sh(__m128h __a, __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);

_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);

_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}
1012#define _mm512_getmant_ph(A, B, C) \
1013 ((__m512h)__builtin_ia32_getmantph512_mask( \
1014 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1015 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
1016 _MM_FROUND_CUR_DIRECTION))
1018#define _mm512_mask_getmant_ph(W, U, A, B, C) \
1019 ((__m512h)__builtin_ia32_getmantph512_mask( \
1020 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1021 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1023#define _mm512_maskz_getmant_ph(U, A, B, C) \
1024 ((__m512h)__builtin_ia32_getmantph512_mask( \
1025 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1026 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1028#define _mm512_getmant_round_ph(A, B, C, R) \
1029 ((__m512h)__builtin_ia32_getmantph512_mask( \
1030 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1031 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1033#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
1034 ((__m512h)__builtin_ia32_getmantph512_mask( \
1035 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1036 (__mmask32)(U), (int)(R)))
1038#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
1039 ((__m512h)__builtin_ia32_getmantph512_mask( \
1040 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1041 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
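/* For the getmant intrinsics, the immediate packs the normalization-interval
   selector in bits 1:0 and the sign-control selector in bits 3:2, which is
   why these macros build it as (((C) << 2) | (B)) (or ((D) << 2) | (C) in the
   scalar forms). */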
1044 return (__m512h)__builtin_ia32_getexpph512_mask(
1045 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1050_mm512_mask_getexp_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1051 return (__m512h)__builtin_ia32_getexpph512_mask(
1056_mm512_maskz_getexp_ph(
__mmask32 __U, __m512h __A) {
1057 return (__m512h)__builtin_ia32_getexpph512_mask(
1058 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1062#define _mm512_getexp_round_ph(A, R) \
1063 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1064 (__v32hf)_mm512_undefined_ph(), \
1065 (__mmask32)-1, (int)(R)))
1067#define _mm512_mask_getexp_round_ph(W, U, A, R) \
1068 ((__m512h)__builtin_ia32_getexpph512_mask( \
1069 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
1071#define _mm512_maskz_getexp_round_ph(U, A, R) \
1072 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1073 (__v32hf)_mm512_setzero_ph(), \
1074 (__mmask32)(U), (int)(R)))
1078 return (__m512h)__builtin_ia32_scalefph512_mask(
1079 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1084_mm512_mask_scalef_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
1085 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1091_mm512_maskz_scalef_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
1092 return (__m512h)__builtin_ia32_scalefph512_mask(
1093 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1097#define _mm512_scalef_round_ph(A, B, R) \
1098 ((__m512h)__builtin_ia32_scalefph512_mask( \
1099 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1100 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1102#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
1103 ((__m512h)__builtin_ia32_scalefph512_mask( \
1104 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
1105 (__mmask32)(U), (int)(R)))
1107#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
1108 ((__m512h)__builtin_ia32_scalefph512_mask( \
1109 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1110 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1112#define _mm512_roundscale_ph(A, B) \
1113 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1114 (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
1115 _MM_FROUND_CUR_DIRECTION))
1117#define _mm512_mask_roundscale_ph(A, B, C, imm) \
1118 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1119 (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
1120 (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
1122#define _mm512_maskz_roundscale_ph(A, B, imm) \
1123 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1124 (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1125 (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
1127#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
1128 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
1129 (__v32hf)(__m512h)(A), \
1130 (__mmask32)(B), (int)(R)))
1132#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
1133 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
1134 (__v32hf)_mm512_setzero_ph(), \
1135 (__mmask32)(A), (int)(R)))
1137#define _mm512_roundscale_round_ph(A, imm, R) \
1138 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
1139 (__v32hf)_mm512_undefined_ph(), \
1140 (__mmask32)-1, (int)(R)))
1142#define _mm512_reduce_ph(A, imm) \
1143 ((__m512h)__builtin_ia32_reduceph512_mask( \
1144 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
1145 (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1147#define _mm512_mask_reduce_ph(W, U, A, imm) \
1148 ((__m512h)__builtin_ia32_reduceph512_mask( \
1149 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
1150 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1152#define _mm512_maskz_reduce_ph(U, A, imm) \
1153 ((__m512h)__builtin_ia32_reduceph512_mask( \
1154 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1155 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1157#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
1158 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1159 (__v32hf)(__m512h)(W), \
1160 (__mmask32)(U), (int)(R)))
1162#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
1163 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1164 (__v32hf)_mm512_setzero_ph(), \
1165 (__mmask32)(U), (int)(R)))
1167#define _mm512_reduce_round_ph(A, imm, R) \
1168 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1169 (__v32hf)_mm512_undefined_ph(), \
1170 (__mmask32)-1, (int)(R)))
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);

  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);

  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);

  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);

  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);

_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1213#define _mm_getmant_round_sh(A, B, C, D, R) \
1214 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1215 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1216 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
1218#define _mm_getmant_sh(A, B, C, D) \
1219 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1220 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1221 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
1223#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
1224 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1225 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1226 (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1228#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
1229 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1230 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1231 (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
1233#define _mm_maskz_getmant_sh(U, A, B, C, D) \
1234 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1235 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1236 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1238#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
1239 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1240 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1241 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1243#define _mm_getexp_round_sh(A, B, R) \
1244 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1245 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1246 (__mmask8)-1, (int)(R)))
1250 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1251 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1256_mm_mask_getexp_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1257 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1258 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (
__mmask8)__U,
1262#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
1263 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1264 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1265 (__mmask8)(U), (int)(R)))
1268_mm_maskz_getexp_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1269 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1270 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1274#define _mm_maskz_getexp_round_sh(U, A, B, R) \
1275 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1276 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1277 (__mmask8)(U), (int)(R)))
1279#define _mm_scalef_round_sh(A, B, R) \
1280 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1281 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1282 (__mmask8)-1, (int)(R)))
1286 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1287 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1292_mm_mask_scalef_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1293 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1298#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
1299 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1300 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1301 (__mmask8)(U), (int)(R)))
1304_mm_maskz_scalef_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1305 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1306 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1310#define _mm_maskz_scalef_round_sh(U, A, B, R) \
1311 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1312 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1313 (__mmask8)(U), (int)(R)))
1315#define _mm_roundscale_round_sh(A, B, imm, R) \
1316 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1317 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1318 (__mmask8)-1, (int)(imm), (int)(R)))
1320#define _mm_roundscale_sh(A, B, imm) \
1321 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1322 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1323 (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
1325#define _mm_mask_roundscale_sh(W, U, A, B, I) \
1326 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1327 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1328 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1330#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
1331 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1332 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1333 (__mmask8)(U), (int)(I), (int)(R)))
1335#define _mm_maskz_roundscale_sh(U, A, B, I) \
1336 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1337 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1338 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1340#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
1341 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1342 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1343 (__mmask8)(U), (int)(I), (int)(R)))
1345#define _mm_reduce_sh(A, B, C) \
1346 ((__m128h)__builtin_ia32_reducesh_mask( \
1347 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1348 (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
1350#define _mm_mask_reduce_sh(W, U, A, B, C) \
1351 ((__m128h)__builtin_ia32_reducesh_mask( \
1352 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1353 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1355#define _mm_maskz_reduce_sh(U, A, B, C) \
1356 ((__m128h)__builtin_ia32_reducesh_mask( \
1357 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1358 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1360#define _mm_reduce_round_sh(A, B, C, R) \
1361 ((__m128h)__builtin_ia32_reducesh_mask( \
1362 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1363 (__mmask8)-1, (int)(C), (int)(R)))
1365#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
1366 ((__m128h)__builtin_ia32_reducesh_mask( \
1367 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1368 (__mmask8)(U), (int)(C), (int)(R)))
1370#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
1371 ((__m128h)__builtin_ia32_reducesh_mask( \
1372 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1373 (__mmask8)(U), (int)(C), (int)(R)))
1375#define _mm512_sqrt_round_ph(A, R) \
1376 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
1378#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
1379 ((__m512h)__builtin_ia32_selectph_512( \
1380 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1381 (__v32hf)(__m512h)(W)))
1383#define _mm512_maskz_sqrt_round_ph(U, A, R) \
1384 ((__m512h)__builtin_ia32_selectph_512( \
1385 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1386 (__v32hf)_mm512_setzero_ph()))
1389 return (__m512h)__builtin_elementwise_sqrt((__v32hf)__A);
1393_mm512_mask_sqrt_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1394 return (__m512h)__builtin_ia32_selectph_512(
1395 (
__mmask32)(__U), (__v32hf)_mm512_sqrt_ph(__A), (__v32hf)(__m512h)(__W));
1399_mm512_maskz_sqrt_ph(
__mmask32 __U, __m512h __A) {
1400 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)(__U),
1401 (__v32hf)_mm512_sqrt_ph(__A),
1402 (__v32hf)_mm512_setzero_ph());
1405#define _mm_sqrt_round_sh(A, B, R) \
1406 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1407 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1408 (__mmask8)-1, (int)(R)))
1410#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
1411 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1412 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1413 (__mmask8)(U), (int)(R)))
1415#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
1416 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1417 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1418 (__mmask8)(U), (int)(R)))
1422 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1423 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1431 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1432 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1439 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1440 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1460#define _mm512_cvt_roundpd_ph(A, R) \
1461 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1462 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1464#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1465 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1466 (__mmask8)(U), (int)(R)))
1468#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1469 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1470 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1473 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1474 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1479_mm512_mask_cvtpd_ph(__m128h __W,
__mmask8 __U, __m512d __A) {
1480 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1485_mm512_maskz_cvtpd_ph(
__mmask8 __U, __m512d __A) {
1486 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1487 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1491#define _mm512_cvt_roundph_pd(A, R) \
1492 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1493 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1495#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1496 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1497 (__mmask8)(U), (int)(R)))
1499#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1500 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1501 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1504 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1510_mm512_mask_cvtph_pd(__m512d __W,
__mmask8 __U, __m128h __A) {
1511 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1516_mm512_maskz_cvtph_pd(
__mmask8 __U, __m128h __A) {
1517 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1522#define _mm_cvt_roundsh_ss(A, B, R) \
1523 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1524 (__v4sf)_mm_undefined_ps(), \
1525 (__mmask8)(-1), (int)(R)))
1527#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1528 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1529 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1531#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1532 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1533 (__v4sf)_mm_setzero_ps(), \
1534 (__mmask8)(U), (int)(R)))
1538 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1547 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1555 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1560#define _mm_cvt_roundss_sh(A, B, R) \
1561 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1562 (__v8hf)_mm_undefined_ph(), \
1563 (__mmask8)(-1), (int)(R)))
1565#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1566 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1567 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1569#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1570 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1571 (__v8hf)_mm_setzero_ph(), \
1572 (__mmask8)(U), (int)(R)))
1576 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1577 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1585 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1586 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (
__mmask8)__U,
1593 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1594 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1598#define _mm_cvt_roundsd_sh(A, B, R) \
1599 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1600 (__v8hf)_mm_undefined_ph(), \
1601 (__mmask8)(-1), (int)(R)))
1603#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1604 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1605 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1607#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1608 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1609 (__v8hf)_mm_setzero_ph(), \
1610 (__mmask8)(U), (int)(R)))
1614 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1615 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1623 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1624 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (
__mmask8)__U,
1629_mm_maskz_cvtsd_sh(
__mmask8 __U, __m128h __A, __m128d __B) {
1630 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1631 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1635#define _mm_cvt_roundsh_sd(A, B, R) \
1636 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1637 (__v2df)_mm_undefined_pd(), \
1638 (__mmask8)(-1), (int)(R)))
1640#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1641 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1642 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1644#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1645 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1646 (__v2df)_mm_setzero_pd(), \
1647 (__mmask8)(U), (int)(R)))
1651 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1660 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1661 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (
__mmask8)__U,
1666_mm_maskz_cvtsh_sd(
__mmask8 __U, __m128d __A, __m128h __B) {
1667 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1672#define _mm512_cvt_roundph_epi16(A, R) \
1673 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1674 (__v32hi)_mm512_undefined_epi32(), \
1675 (__mmask32)(-1), (int)(R)))
1677#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1678 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1679 (__mmask32)(U), (int)(R)))
1681#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1682 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1683 (__v32hi)_mm512_setzero_epi32(), \
1684 (__mmask32)(U), (int)(R)))
1687_mm512_cvtph_epi16(__m512h __A) {
1688 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1694_mm512_mask_cvtph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1695 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1700_mm512_maskz_cvtph_epi16(
__mmask32 __U, __m512h __A) {
1701 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1706#define _mm512_cvtt_roundph_epi16(A, R) \
1707 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1708 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1711#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1712 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1713 (__mmask32)(U), (int)(R)))
1715#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1716 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1717 (__v32hi)_mm512_setzero_epi32(), \
1718 (__mmask32)(U), (int)(R)))
1721_mm512_cvttph_epi16(__m512h __A) {
1722 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1728_mm512_mask_cvttph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1729 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1734_mm512_maskz_cvttph_epi16(
__mmask32 __U, __m512h __A) {
1735 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1740#define _mm512_cvt_roundepi16_ph(A, R) \
1741 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1742 (__v32hf)_mm512_undefined_ph(), \
1743 (__mmask32)(-1), (int)(R)))
1745#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1746 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1747 (__mmask32)(U), (int)(R)))
1749#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1750 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1751 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1754_mm512_cvtepi16_ph(__m512i __A) {
1755 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1756 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1761_mm512_mask_cvtepi16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1762 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1767_mm512_maskz_cvtepi16_ph(
__mmask32 __U, __m512i __A) {
1768 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1769 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1773#define _mm512_cvt_roundph_epu16(A, R) \
1774 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1775 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1778#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1779 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1780 (__mmask32)(U), (int)(R)))
1782#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1783 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1784 (__v32hu)_mm512_setzero_epi32(), \
1785 (__mmask32)(U), (int)(R)))
1788_mm512_cvtph_epu16(__m512h __A) {
1789 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1795_mm512_mask_cvtph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1796 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1801_mm512_maskz_cvtph_epu16(
__mmask32 __U, __m512h __A) {
1802 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1807#define _mm512_cvtt_roundph_epu16(A, R) \
1808 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1809 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1812#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1813 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1814 (__mmask32)(U), (int)(R)))
1816#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1817 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1818 (__v32hu)_mm512_setzero_epi32(), \
1819 (__mmask32)(U), (int)(R)))
1822_mm512_cvttph_epu16(__m512h __A) {
1823 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1829_mm512_mask_cvttph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1830 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1835_mm512_maskz_cvttph_epu16(
__mmask32 __U, __m512h __A) {
1836 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1841#define _mm512_cvt_roundepu16_ph(A, R) \
1842 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1843 (__v32hf)_mm512_undefined_ph(), \
1844 (__mmask32)(-1), (int)(R)))
1846#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1847 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1848 (__mmask32)(U), (int)(R)))
1850#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1851 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1852 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1855_mm512_cvtepu16_ph(__m512i __A) {
1856 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1857 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1862_mm512_mask_cvtepu16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1863 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1868_mm512_maskz_cvtepu16_ph(
__mmask32 __U, __m512i __A) {
1869 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1870 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1874#define _mm512_cvt_roundph_epi32(A, R) \
1875 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1876 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1879#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1880 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1881 (__mmask16)(U), (int)(R)))
1883#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1884 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1885 (__v16si)_mm512_setzero_epi32(), \
1886 (__mmask16)(U), (int)(R)))
1889_mm512_cvtph_epi32(__m256h __A) {
1890 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1896_mm512_mask_cvtph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
1897 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1902_mm512_maskz_cvtph_epi32(
__mmask16 __U, __m256h __A) {
1903 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1908#define _mm512_cvt_roundph_epu32(A, R) \
1909 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1910 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1913#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1914 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1915 (__mmask16)(U), (int)(R)))
1917#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1918 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1919 (__v16su)_mm512_setzero_epi32(), \
1920 (__mmask16)(U), (int)(R)))
1923_mm512_cvtph_epu32(__m256h __A) {
1924 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1930_mm512_mask_cvtph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
1931 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1936_mm512_maskz_cvtph_epu32(
__mmask16 __U, __m256h __A) {
1937 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1942#define _mm512_cvt_roundepi32_ph(A, R) \
1943 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1944 (__v16hf)_mm256_undefined_ph(), \
1945 (__mmask16)(-1), (int)(R)))
1947#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1948 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1949 (__mmask16)(U), (int)(R)))
1951#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1952 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1953 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1956_mm512_cvtepi32_ph(__m512i __A) {
1957 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1958 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1963_mm512_mask_cvtepi32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1964 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1969_mm512_maskz_cvtepi32_ph(
__mmask16 __U, __m512i __A) {
1970 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1971 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
1975#define _mm512_cvt_roundepu32_ph(A, R) \
1976 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1977 (__v16hf)_mm256_undefined_ph(), \
1978 (__mmask16)(-1), (int)(R)))
1980#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1981 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1982 (__mmask16)(U), (int)(R)))
1984#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1985 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1986 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1989_mm512_cvtepu32_ph(__m512i __A) {
1990 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1991 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1996_mm512_mask_cvtepu32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1997 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2002_mm512_maskz_cvtepu32_ph(
__mmask16 __U, __m512i __A) {
2003 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2004 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
2008#define _mm512_cvtt_roundph_epi32(A, R) \
2009 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2010 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2013#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2014 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2015 (__mmask16)(U), (int)(R)))
2017#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2018 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2019 (__v16si)_mm512_setzero_epi32(), \
2020 (__mmask16)(U), (int)(R)))
2023_mm512_cvttph_epi32(__m256h __A) {
2024 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2030_mm512_mask_cvttph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
2031 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2036_mm512_maskz_cvttph_epi32(
__mmask16 __U, __m256h __A) {
2037 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2042#define _mm512_cvtt_roundph_epu32(A, R) \
2043 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2044 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2047#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2048 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2049 (__mmask16)(U), (int)(R)))
2051#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2052 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2053 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2057_mm512_cvttph_epu32(__m256h __A) {
2058 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2064_mm512_mask_cvttph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
2065 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2070_mm512_maskz_cvttph_epu32(
__mmask16 __U, __m256h __A) {
2071 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2076#define _mm512_cvt_roundepi64_ph(A, R) \
2077 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2078 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2080#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2081 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2082 (__mmask8)(U), (int)(R)))
2084#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2085 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2086 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2089_mm512_cvtepi64_ph(__m512i __A) {
2090 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2091 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2096_mm512_mask_cvtepi64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2097 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2102_mm512_maskz_cvtepi64_ph(
__mmask8 __U, __m512i __A) {
2103 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2104 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2108#define _mm512_cvt_roundph_epi64(A, R) \
2109 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2110 (__v8di)_mm512_undefined_epi32(), \
2111 (__mmask8)(-1), (int)(R)))
2113#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2114 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2115 (__mmask8)(U), (int)(R)))
2117#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2118 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2119 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2122_mm512_cvtph_epi64(__m128h __A) {
2123 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2129_mm512_mask_cvtph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2130 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2135_mm512_maskz_cvtph_epi64(
__mmask8 __U, __m128h __A) {
2136 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2141#define _mm512_cvt_roundepu64_ph(A, R) \
2142 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2143 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2145#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2146 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2147 (__mmask8)(U), (int)(R)))
2149#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2150 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2151 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2154_mm512_cvtepu64_ph(__m512i __A) {
2155 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2156 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2161_mm512_mask_cvtepu64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2162 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2167_mm512_maskz_cvtepu64_ph(
__mmask8 __U, __m512i __A) {
2168 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2169 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2173#define _mm512_cvt_roundph_epu64(A, R) \
2174 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2175 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2178#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2179 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2180 (__mmask8)(U), (int)(R)))
2182#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2183 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2184 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2187_mm512_cvtph_epu64(__m128h __A) {
2188 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2194_mm512_mask_cvtph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2195 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2200_mm512_maskz_cvtph_epu64(
__mmask8 __U, __m128h __A) {
2201 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2206#define _mm512_cvtt_roundph_epi64(A, R) \
2207 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2208 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2211#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2212 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2213 (__mmask8)(U), (int)(R)))
2215#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2216 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2217 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2220_mm512_cvttph_epi64(__m128h __A) {
2221 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2227_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2228 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2233_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2234 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2234 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2239#define _mm512_cvtt_roundph_epu64(A, R) \
2240 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2241 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2244#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2245 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2246 (__mmask8)(U), (int)(R)))
2248#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2249 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2250 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2253_mm512_cvttph_epu64(__m128h __A) {
2254 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2260_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2261 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2266_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2267 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2267 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2272#define _mm_cvt_roundsh_i32(A, R) \
2273 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
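/* Illustrative usage (sketch, not part of the original header): scalar
 * _Float16 -> int conversion with an explicit rounding mode; _mm_set_sh
 * comes from this same header.
 *
 *   __m128h s = _mm_set_sh((_Float16)2.5f);
 *   int i = _mm_cvt_roundsh_i32(s, _MM_FROUND_TO_NEAREST_INT |
 *                                      _MM_FROUND_NO_EXC);
 */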
2279#define _mm_cvt_roundsh_u32(A, R) \
2280 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2283_mm_cvtsh_u32(__m128h __A) {
2284 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2289#define _mm_cvt_roundsh_i64(A, R) \
2290 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2293 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2297#define _mm_cvt_roundsh_u64(A, R) \
2298 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2301_mm_cvtsh_u64(__m128h __A) {
2302 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2307#define _mm_cvt_roundu32_sh(A, B, R) \
2308 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2311_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2317#define _mm_cvt_roundu64_sh(A, B, R) \
2318 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2322_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2328#define _mm_cvt_roundi32_sh(A, B, R) \
2329 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2338#define _mm_cvt_roundi64_sh(A, B, R) \
2339 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2348#define _mm_cvtt_roundsh_i32(A, R) \
2349 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2352 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2357#define _mm_cvtt_roundsh_i64(A, R) \
2358 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2361 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2366#define _mm_cvtt_roundsh_u32(A, R) \
2367 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2370_mm_cvttsh_u32(__m128h __A) {
2371 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2376#define _mm_cvtt_roundsh_u64(A, R) \
2377 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2380_mm_cvttsh_u64(__m128h __A) {
2381 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2386#define _mm512_cvtx_roundph_ps(A, R) \
2387 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2388 (__v16sf)_mm512_undefined_ps(), \
2389 (__mmask16)(-1), (int)(R)))
2391#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2392 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2393 (__mmask16)(U), (int)(R)))
2395#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2396 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2397 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
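/* Illustrative usage (sketch, not part of the original header): widen sixteen
 * _Float16 lanes to float.  The ph -> ps conversion is exact, so only the SAE
 * bit is meaningful here.
 *
 *   __m256h h = _mm256_set1_ph((_Float16)0.5f);
 *   __m512  f = _mm512_cvtx_roundph_ps(h, _MM_FROUND_NO_EXC);
 */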
2400 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2406_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2407 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2412_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2413 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2413 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2418#define _mm512_cvtx_roundps_ph(A, R) \
2419 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2420 (__v16hf)_mm256_undefined_ph(), \
2421 (__mmask16)(-1), (int)(R)))
2423#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2424 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2425 (__mmask16)(U), (int)(R)))
2427#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2428 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2429 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2432 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2433 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2438_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2439 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2444_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2445 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2446 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2450#define _mm512_fmadd_round_ph(A, B, C, R) \
2451 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2452 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2453 (__mmask32)-1, (int)(R)))
2455#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2456 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2457 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2458 (__mmask32)(U), (int)(R)))
2460#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2461 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2462 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2463 (__mmask32)(U), (int)(R)))
2465#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2466 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2467 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2468 (__mmask32)(U), (int)(R)))
2470#define _mm512_fmsub_round_ph(A, B, C, R) \
2471 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2472 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2473 (__mmask32)-1, (int)(R)))
2475#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2476 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2477 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2478 (__mmask32)(U), (int)(R)))
2480#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2481 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2482 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2483 (__mmask32)(U), (int)(R)))
2485#define _mm512_fnmadd_round_ph(A, B, C, R) \
2486 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2487 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2488 (__mmask32)-1, (int)(R)))
2490#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2491 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2492 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2493 (__mmask32)(U), (int)(R)))
2495#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2496 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2497 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2498 (__mmask32)(U), (int)(R)))
2500#define _mm512_fnmsub_round_ph(A, B, C, R) \
2501 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2502 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2503 (__mmask32)-1, (int)(R)))
2505#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2506 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2507 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2508 (__mmask32)(U), (int)(R)))
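/* Illustrative usage (sketch, not part of the original header): fused
 * multiply-add over 32 _Float16 lanes, first with an explicit rounding mode
 * and then with the zero-masking form.
 *
 *   __m512h a = _mm512_set1_ph((_Float16)2.0f);
 *   __m512h b = _mm512_set1_ph((_Float16)3.0f);
 *   __m512h c = _mm512_set1_ph((_Float16)1.0f);
 *   __m512h r0 = _mm512_fmadd_round_ph(a, b, c, _MM_FROUND_CUR_DIRECTION);
 *   __m512h r1 = _mm512_maskz_fmadd_ph(0x0000FFFF, a, b, c); // low 16 lanes only
 */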
2513 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2519_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2520 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2526_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2527 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2533_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2534 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2534 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2542 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2548_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2549 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2555_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2556 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2557 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2564 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2570_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2571 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2577_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2578 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2586 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2592_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2593 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2594 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2598#define _mm512_fmaddsub_round_ph(A, B, C, R) \
2599 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2600 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2601 (__mmask32)-1, (int)(R)))
2603#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2604 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2605 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2606 (__mmask32)(U), (int)(R)))
2608#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2609 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2610 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2611 (__mmask32)(U), (int)(R)))
2613#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2614 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2615 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2616 (__mmask32)(U), (int)(R)))
2618#define _mm512_fmsubadd_round_ph(A, B, C, R) \
2619 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2620 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2621 (__mmask32)-1, (int)(R)))
2623#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2624 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2625 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2626 (__mmask32)(U), (int)(R)))
2628#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2629 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2630 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2631 (__mmask32)(U), (int)(R)))
2634_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2635 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2636 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2641_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2642 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2643 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2648_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2649 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2650 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2655_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2656 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2657 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2662_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2663 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2664 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2669_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2670 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2671 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2676_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2677 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2678 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2682#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2683 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2684 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2685 (__mmask32)(U), (int)(R)))
2688_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2689 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2694#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2695 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2696 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2697 (__mmask32)(U), (int)(R)))
2700_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2701 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2702 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2706#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2707 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2708 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2709 (__mmask32)(U), (int)(R)))
2712_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2713 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2718#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2719 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2720 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2721 (__mmask32)(U), (int)(R)))
2723#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2724 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2725 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2726 (__mmask32)(U), (int)(R)))
2729_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2730 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2736_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2737 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2745 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2753 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2757#define _mm_fmadd_round_sh(A, B, C, R) \
2758 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2759 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2760 (__mmask8)-1, (int)(R)))
2762#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2763 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2764 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2765 (__mmask8)(U), (int)(R)))
2768_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2769 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2774#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2775 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2776 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2777 (__mmask8)(U), (int)(R)))
2780_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2781 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2786#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2787 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2788 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2789 (__mmask8)(U), (int)(R)))
2794 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2803 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2808#define _mm_fmsub_round_sh(A, B, C, R) \
2809 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2810 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2811 (__mmask8)-1, (int)(R)))
2813#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2814 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2815 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2816 (__mmask8)(U), (int)(R)))
2819_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2820 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2825#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2826 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2827 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2828 (__mmask8)(U), (int)R))
2831_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2832 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2837#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2838 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2839 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2840 (__mmask8)(U), (int)(R)))
2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2850_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2851 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2855#define _mm_fnmadd_round_sh(A, B, C, R) \
2856 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2857 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2858 (__mmask8)-1, (int)(R)))
2860#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2861 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2862 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2863 (__mmask8)(U), (int)(R)))
2866_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2867 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2872#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2873 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2874 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2875 (__mmask8)(U), (int)(R)))
2878_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2879 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2884#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2885 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2886 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2887 (__mmask8)(U), (int)(R)))
2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2897_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2898 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2902#define _mm_fnmsub_round_sh(A, B, C, R) \
2903 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2904 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2905 (__mmask8)-1, (int)(R)))
2907#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2908 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2909 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2910 (__mmask8)(U), (int)(R)))
2913_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2914 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2919#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2920 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2921 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2922 (__mmask8)(U), (int)(R)))
2925_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2926 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2931#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2932 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2933 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2934 (__mmask8)(U), (int)(R)))
2939 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2945_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2946 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2951_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2952 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2958_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2959 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2959 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2963#define _mm_fcmadd_round_sch(A, B, C, R) \
2964 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2965 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2966 (__mmask8)-1, (int)(R)))
2968#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2969 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2970 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2971 (__mmask8)(U), (int)(R)))
2973#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2974 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2975 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2976 (__mmask8)(U), (int)(R)))
2978#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2979 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2980 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2981 (__mmask8)(U), (int)(R)))
2986 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2992_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2993 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2998_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2999 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3005_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3006 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3006 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3010#define _mm_fmadd_round_sch(A, B, C, R) \
3011 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3012 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3013 (__mmask8)-1, (int)(R)))
3015#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3016 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3017 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3018 (__mmask8)(U), (int)(R)))
3020#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3021 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3022 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3023 (__mmask8)(U), (int)(R)))
3025#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3026 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3027 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3028 (__mmask8)(U), (int)(R)))
3032 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3033 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3038_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3039 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3045_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3046 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3047 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3051#define _mm_fcmul_round_sch(A, B, R) \
3052 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3053 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3054 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3056#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3057 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3058 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3059 (__mmask8)(U), (int)(R)))
3061#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3062 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3063 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3064 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3068 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3069 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3077 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3083_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3084 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3085 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3089#define _mm_fmul_round_sch(A, B, R) \
3090 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3091 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3092 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3094#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3095 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3096 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3097 (__mmask8)(U), (int)(R)))
3099#define _mm_maskz_fmul_round_sch(U, A, B, R) \
3100 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3101 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3102 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3106 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3107 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3112_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3113 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3119_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3120 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3121 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3125#define _mm512_fcmul_round_pch(A, B, R) \
3126 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3127 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3128 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3130#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3131 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3132 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3133 (__mmask16)(U), (int)(R)))
3135#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3136 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3137 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3138 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3142 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3143 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3148_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3149 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3155_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3156 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3157 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3161#define _mm512_fmul_round_pch(A, B, R) \
3162 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3163 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3164 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3166#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3167 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3168 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3169 (__mmask16)(U), (int)(R)))
3171#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3172 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3173 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3174 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3179 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3180 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3185_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3186 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3187 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3192_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3193 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3194 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3199_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3200 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3201 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3205#define _mm512_fcmadd_round_pch(A, B, C, R) \
3206 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3207 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3208 (__mmask16)-1, (int)(R)))
3210#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3211 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3212 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3213 (__mmask16)(U), (int)(R)))
3215#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3216 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3217 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3218 (__mmask16)(U), (int)(R)))
3220#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3221 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3222 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3223 (__mmask16)(U), (int)(R)))
3228 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3234_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3235 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3241_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3242 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3243 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3248_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3249 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3250 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3254#define _mm512_fmadd_round_pch(A, B, C, R) \
3255 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3256 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3257 (__mmask16)-1, (int)(R)))
3259#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3260 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3261 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3262 (__mmask16)(U), (int)(R)))
3264#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3265 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3266 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3267 (__mmask16)(U), (int)(R)))
3269#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3270 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3271 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3272 (__mmask16)(U), (int)(R)))
3275_mm512_reduce_add_ph(__m512h __W) {
3276 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3280_mm512_reduce_mul_ph(__m512h __W) {
3281 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3285_mm512_reduce_max_ph(__m512h __V) {
3286 return __builtin_ia32_reduce_fmax_ph512(__V);
3290_mm512_reduce_min_ph(__m512h __V) {
3291 return __builtin_ia32_reduce_fmin_ph512(__V);
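/* Illustrative usage (sketch, not part of the original header): horizontal
 * reductions over 32 _Float16 lanes.
 *
 *   __m512h v   = _mm512_set1_ph((_Float16)1.0f);
 *   _Float16 s  = _mm512_reduce_add_ph(v);   // 32.0 for this input
 *   _Float16 mx = _mm512_reduce_max_ph(v);
 */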
3295_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3296 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3301_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3302 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3307_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3308 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3312#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
3313#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
3314#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
3315#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
3316#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
3317 _mm512_mask_fmul_round_pch(W, U, A, B, R)
3318#define _mm512_maskz_mul_round_pch(U, A, B, R) \
3319 _mm512_maskz_fmul_round_pch(U, A, B, R)
3321#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
3322#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
3323#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
3324#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
3325#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
3326 _mm512_mask_fcmul_round_pch(W, U, A, B, R)
3327#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
3328 _mm512_maskz_fcmul_round_pch(U, A, B, R)
3330#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
3331#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
3332#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
3333#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
3334#define _mm_mask_mul_round_sch(W, U, A, B, R) \
3335 _mm_mask_fmul_round_sch(W, U, A, B, R)
3336#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
3338#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
3339#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
3340#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
3341#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
3342#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
3343 _mm_mask_fcmul_round_sch(W, U, A, B, R)
3344#define _mm_maskz_cmul_round_sch(U, A, B, R) \
3345 _mm_maskz_fcmul_round_sch(U, A, B, R)
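/* Illustrative usage (sketch, not part of the original header): the
 * *_mul_pch / *_cmul_pch aliases operate on packed complex numbers stored as
 * interleaved _Float16 real/imaginary pairs; the cmul forms use the
 * complex-conjugate multiply (VFCMULCPH).  src_a and src_b are hypothetical
 * _Float16 buffers holding 32 elements (16 complex values) each.
 *
 *   __m512h a  = _mm512_loadu_ph(src_a);
 *   __m512h b  = _mm512_loadu_ph(src_b);
 *   __m512h p  = _mm512_mul_pch(a, b);    // lane-wise complex multiply
 *   __m512h cp = _mm512_cmul_pch(a, b);   // conjugate variant
 */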
3347#undef __DEFAULT_FN_ATTRS128
3348#undef __DEFAULT_FN_ATTRS256
3349#undef __DEFAULT_FN_ATTRS512
3350#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
3351#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
3352#undef __DEFAULT_FN_ATTRS512_CONSTEXPR