10#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
/* Multiple-inclusion guard for the AVX512-FP16 intrinsics header. */
15#ifndef __AVX512FP16INTRIN_H
16#define __AVX512FP16INTRIN_H
/* Attribute set for 512-bit intrinsics: force inlining, emit no debug info,
   require the avx512fp16 target feature, and declare a 512-bit minimum
   vector width. */
24#define __DEFAULT_FN_ATTRS512 \
25 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
26 __min_vector_width__(512)))
/* Same attribute set as __DEFAULT_FN_ATTRS512 but for 256-bit intrinsics. */
27#define __DEFAULT_FN_ATTRS256 \
28 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
29 __min_vector_width__(256)))
/* Same attribute set as __DEFAULT_FN_ATTRS512 but for 128-bit intrinsics. */
30#define __DEFAULT_FN_ATTRS128 \
31 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
32 __min_vector_width__(128)))
34#if defined(__cplusplus) && (__cplusplus >= 201103L)
35#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
36#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
37#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
39#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
40#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
41#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
45_mm512_cvtsh_h(__m512h
__a) {
50 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
54_mm256_setzero_ph(
void) {
55 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
56 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
60 return (__m256h)__builtin_ia32_undef256();
64_mm512_setzero_ph(
void) {
65 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
66 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
67 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
71 return (__m128h)__builtin_ia32_undef128();
75 return (__m512h)__builtin_ia32_undef512();
80 return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
81 __h, __h, __h, __h, __h, __h, __h, __h,
82 __h, __h, __h, __h, __h, __h, __h, __h,
83 __h, __h, __h, __h, __h, __h, __h, __h};
95 return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
96 __h25, __h24, __h23, __h22, __h21, __h20, __h19,
97 __h18, __h17, __h16, __h15, __h14, __h13, __h12,
98 __h11, __h10, __h9, __h8, __h7, __h6, __h5,
99 __h4, __h3, __h2, __h1};
110 return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
111 e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
112 e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
116_mm512_set1_pch(
_Float16 _Complex __h) {
149_mm256_castph_si256(__m256h
__a) {
154_mm512_castph_si512(__m512h
__a) {
187_mm256_castsi256_ph(__m256i
__a) {
192_mm512_castsi512_ph(__m512i
__a) {
197_mm256_castph256_ph128(__m256h
__a) {
198 return __builtin_shufflevector(
__a,
__a, 0, 1, 2, 3, 4, 5, 6, 7);
202_mm512_castph512_ph128(__m512h
__a) {
203 return __builtin_shufflevector(
__a,
__a, 0, 1, 2, 3, 4, 5, 6, 7);
207_mm512_castph512_ph256(__m512h
__a) {
208 return __builtin_shufflevector(
__a,
__a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
213_mm256_castph128_ph256(__m128h
__a) {
214 return __builtin_shufflevector(
__a, __builtin_nondeterministic_value(
__a),
215 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
219_mm512_castph128_ph512(__m128h
__a) {
220 __m256h
__b = __builtin_nondeterministic_value(
__b);
221 return __builtin_shufflevector(
222 __builtin_shufflevector(
__a, __builtin_nondeterministic_value(
__a),
223 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
224 __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
225 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
229_mm512_castph256_ph512(__m256h
__a) {
230 return __builtin_shufflevector(
__a, __builtin_nondeterministic_value(
__a), 0,
231 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
232 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
250_mm256_zextph128_ph256(__m128h
__a) {
251 return __builtin_shufflevector(
__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
252 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
269_mm512_zextph128_ph512(__m128h
__a) {
270 return __builtin_shufflevector(
271 __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
272 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
289_mm512_zextph256_ph512(__m256h
__a) {
290 return __builtin_shufflevector(
__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
291 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
292 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
/* Scalar fp16 compare via the vcomish builtin: P selects the comparison
   predicate and R the rounding control; the builtin's int result is returned.
   A and B are now fully parenthesized (`(__v8hf)(__m128h)(A)`), matching the
   sibling macros in this header, so compound expressions passed as arguments
   cannot misparse under the cast. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),           \
                         (int)(P), (int)(R))

/* Convenience wrapper performing the compare with the current rounding
   direction. */
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
304 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_EQ_OS,
310 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LT_OS,
316 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LE_OS,
322 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GT_OS,
328 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GE_OS,
334 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_NEQ_US,
340 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_EQ_OQ,
346 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LT_OQ,
352 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_LE_OQ,
358 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GT_OQ,
364 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_GE_OQ,
370 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B,
_CMP_NEQ_UQ,
376 return (__m512h)((__v32hf)__A + (__v32hf)__B);
380_mm512_mask_add_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
381 return (__m512h)__builtin_ia32_selectph_512(
382 (
__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
386_mm512_maskz_add_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
387 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
388 (__v32hf)_mm512_add_ph(__A, __B),
389 (__v32hf)_mm512_setzero_ph());
/* Packed fp16 add of A and B with explicit rounding control R
   (forwarded to the builtin as an int). */
392#define _mm512_add_round_ph(A, B, R) \
393 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
394 (__v32hf)(__m512h)(B), (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
396#define _mm512_mask_add_round_ph(W, U, A, B, R) \
397 ((__m512h)__builtin_ia32_selectph_512( \
398 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
399 (__v32hf)(__m512h)(W)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
401#define _mm512_maskz_add_round_ph(U, A, B, R) \
402 ((__m512h)__builtin_ia32_selectph_512( \
403 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
404 (__v32hf)_mm512_setzero_ph()))
408 return (__m512h)((__v32hf)__A - (__v32hf)__B);
412_mm512_mask_sub_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
413 return (__m512h)__builtin_ia32_selectph_512(
414 (
__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
418_mm512_maskz_sub_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
419 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
420 (__v32hf)_mm512_sub_ph(__A, __B),
421 (__v32hf)_mm512_setzero_ph());
/* Packed fp16 subtract (A - B) with explicit rounding control R. */
424#define _mm512_sub_round_ph(A, B, R) \
425 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
426 (__v32hf)(__m512h)(B), (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
428#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
429 ((__m512h)__builtin_ia32_selectph_512( \
430 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
431 (__v32hf)(__m512h)(W)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
433#define _mm512_maskz_sub_round_ph(U, A, B, R) \
434 ((__m512h)__builtin_ia32_selectph_512( \
435 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
436 (__v32hf)_mm512_setzero_ph()))
440 return (__m512h)((__v32hf)__A * (__v32hf)__B);
444_mm512_mask_mul_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
445 return (__m512h)__builtin_ia32_selectph_512(
446 (
__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
450_mm512_maskz_mul_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
451 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
452 (__v32hf)_mm512_mul_ph(__A, __B),
453 (__v32hf)_mm512_setzero_ph());
/* Packed fp16 multiply with explicit rounding control R. */
456#define _mm512_mul_round_ph(A, B, R) \
457 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
458 (__v32hf)(__m512h)(B), (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
460#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
461 ((__m512h)__builtin_ia32_selectph_512( \
462 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
463 (__v32hf)(__m512h)(W)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
465#define _mm512_maskz_mul_round_ph(U, A, B, R) \
466 ((__m512h)__builtin_ia32_selectph_512( \
467 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
468 (__v32hf)_mm512_setzero_ph()))
472 return (__m512h)((__v32hf)__A / (__v32hf)__B);
476_mm512_mask_div_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
477 return (__m512h)__builtin_ia32_selectph_512(
478 (
__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
482_mm512_maskz_div_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
483 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
484 (__v32hf)_mm512_div_ph(__A, __B),
485 (__v32hf)_mm512_setzero_ph());
/* Packed fp16 divide (A / B) with explicit rounding control R. */
488#define _mm512_div_round_ph(A, B, R) \
489 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
490 (__v32hf)(__m512h)(B), (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
492#define _mm512_mask_div_round_ph(W, U, A, B, R) \
493 ((__m512h)__builtin_ia32_selectph_512( \
494 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
495 (__v32hf)(__m512h)(W)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
497#define _mm512_maskz_div_round_ph(U, A, B, R) \
498 ((__m512h)__builtin_ia32_selectph_512( \
499 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
500 (__v32hf)_mm512_setzero_ph()))
502static __inline__ __m512h
504 return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
509_mm512_mask_min_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
510 return (__m512h)__builtin_ia32_selectph_512(
511 (
__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
515_mm512_maskz_min_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
516 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
517 (__v32hf)_mm512_min_ph(__A, __B),
518 (__v32hf)_mm512_setzero_ph());
/* Packed fp16 minimum with explicit rounding/SAE control R. */
521#define _mm512_min_round_ph(A, B, R) \
522 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
523 (__v32hf)(__m512h)(B), (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
525#define _mm512_mask_min_round_ph(W, U, A, B, R) \
526 ((__m512h)__builtin_ia32_selectph_512( \
527 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
528 (__v32hf)(__m512h)(W)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
530#define _mm512_maskz_min_round_ph(U, A, B, R) \
531 ((__m512h)__builtin_ia32_selectph_512( \
532 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
533 (__v32hf)_mm512_setzero_ph()))
535static __inline__ __m512h
537 return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
542_mm512_mask_max_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
543 return (__m512h)__builtin_ia32_selectph_512(
544 (
__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
548_mm512_maskz_max_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
549 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U,
550 (__v32hf)_mm512_max_ph(__A, __B),
551 (__v32hf)_mm512_setzero_ph());
/* Packed fp16 maximum with explicit rounding/SAE control R. */
554#define _mm512_max_round_ph(A, B, R) \
555 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
556 (__v32hf)(__m512h)(B), (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
558#define _mm512_mask_max_round_ph(W, U, A, B, R) \
559 ((__m512h)__builtin_ia32_selectph_512( \
560 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
561 (__v32hf)(__m512h)(W)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
563#define _mm512_maskz_max_round_ph(U, A, B, R) \
564 ((__m512h)__builtin_ia32_selectph_512( \
565 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
566 (__v32hf)_mm512_setzero_ph()))
569_mm512_abs_ph(__m512h __A) {
579_mm512_mask_conj_pch(__m512h __W,
__mmask16 __U, __m512h __A) {
580 return (__m512h)__builtin_ia32_selectps_512(
581 (
__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
585_mm512_maskz_conj_pch(
__mmask16 __U, __m512h __A) {
586 return (__m512h)__builtin_ia32_selectps_512((
__mmask16)__U,
587 (__v16sf)_mm512_conj_pch(__A),
592_mm_add_sh(__m128h __A, __m128h __B) {
598_mm_mask_add_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
599 __A = _mm_add_sh(__A, __B);
600 return __builtin_ia32_selectsh_128(__U, __A, __W);
604_mm_maskz_add_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
605 __A = _mm_add_sh(__A, __B);
606 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Scalar (low-element) fp16 add with rounding control R; unmasked form
   passes an all-ones mask and a zero vector as the (unused) passthrough. */
609#define _mm_add_round_sh(A, B, R) \
610 ((__m128h)__builtin_ia32_addsh_round_mask( \
611 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
612 (__mmask8)-1, (int)(R)))
/* Merge-masked variant: the low element falls back to W when U bit 0 is 0. */
614#define _mm_mask_add_round_sh(W, U, A, B, R) \
615 ((__m128h)__builtin_ia32_addsh_round_mask( \
616 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
617 (__mmask8)(U), (int)(R)))
/* Zero-masked variant: the low element is zeroed when U bit 0 is 0. */
619#define _mm_maskz_add_round_sh(U, A, B, R) \
620 ((__m128h)__builtin_ia32_addsh_round_mask( \
621 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
622 (__mmask8)(U), (int)(R)))
624static __inline__ __m128h
631_mm_mask_sub_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
632 __A = _mm_sub_sh(__A, __B);
633 return __builtin_ia32_selectsh_128(__U, __A, __W);
637_mm_maskz_sub_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
638 __A = _mm_sub_sh(__A, __B);
639 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Scalar (low-element) fp16 subtract with rounding control R. */
642#define _mm_sub_round_sh(A, B, R) \
643 ((__m128h)__builtin_ia32_subsh_round_mask( \
644 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
645 (__mmask8)-1, (int)(R)))
/* Merge-masked variant: the low element falls back to W when U bit 0 is 0. */
647#define _mm_mask_sub_round_sh(W, U, A, B, R) \
648 ((__m128h)__builtin_ia32_subsh_round_mask( \
649 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
650 (__mmask8)(U), (int)(R)))
/* Zero-masked variant: the low element is zeroed when U bit 0 is 0. */
652#define _mm_maskz_sub_round_sh(U, A, B, R) \
653 ((__m128h)__builtin_ia32_subsh_round_mask( \
654 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
655 (__mmask8)(U), (int)(R)))
657static __inline__ __m128h
664_mm_mask_mul_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
665 __A = _mm_mul_sh(__A, __B);
666 return __builtin_ia32_selectsh_128(__U, __A, __W);
670_mm_maskz_mul_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
671 __A = _mm_mul_sh(__A, __B);
672 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Scalar (low-element) fp16 multiply with rounding control R. */
675#define _mm_mul_round_sh(A, B, R) \
676 ((__m128h)__builtin_ia32_mulsh_round_mask( \
677 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
678 (__mmask8)-1, (int)(R)))
/* Merge-masked variant: the low element falls back to W when U bit 0 is 0. */
680#define _mm_mask_mul_round_sh(W, U, A, B, R) \
681 ((__m128h)__builtin_ia32_mulsh_round_mask( \
682 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
683 (__mmask8)(U), (int)(R)))
/* Zero-masked variant: the low element is zeroed when U bit 0 is 0. */
685#define _mm_maskz_mul_round_sh(U, A, B, R) \
686 ((__m128h)__builtin_ia32_mulsh_round_mask( \
687 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
688 (__mmask8)(U), (int)(R)))
690static __inline__ __m128h
697_mm_mask_div_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
698 __A = _mm_div_sh(__A, __B);
699 return __builtin_ia32_selectsh_128(__U, __A, __W);
703_mm_maskz_div_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
704 __A = _mm_div_sh(__A, __B);
705 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
/* Scalar (low-element) fp16 divide with rounding control R. */
708#define _mm_div_round_sh(A, B, R) \
709 ((__m128h)__builtin_ia32_divsh_round_mask( \
710 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
711 (__mmask8)-1, (int)(R)))
/* Merge-masked variant: the low element falls back to W when U bit 0 is 0. */
713#define _mm_mask_div_round_sh(W, U, A, B, R) \
714 ((__m128h)__builtin_ia32_divsh_round_mask( \
715 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
716 (__mmask8)(U), (int)(R)))
/* Zero-masked variant: the low element is zeroed when U bit 0 is 0. */
718#define _mm_maskz_div_round_sh(U, A, B, R) \
719 ((__m128h)__builtin_ia32_divsh_round_mask( \
720 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
721 (__mmask8)(U), (int)(R)))
723static __inline__ __m128h
725 return (__m128h)__builtin_ia32_minsh_round_mask(
726 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
731_mm_mask_min_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
732 return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
738_mm_maskz_min_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
739 return (__m128h)__builtin_ia32_minsh_round_mask(
740 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Scalar (low-element) fp16 minimum with rounding/SAE control R. */
744#define _mm_min_round_sh(A, B, R) \
745 ((__m128h)__builtin_ia32_minsh_round_mask( \
746 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
747 (__mmask8)-1, (int)(R)))
/* Merge-masked variant: the low element falls back to W when U bit 0 is 0. */
749#define _mm_mask_min_round_sh(W, U, A, B, R) \
750 ((__m128h)__builtin_ia32_minsh_round_mask( \
751 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
752 (__mmask8)(U), (int)(R)))
/* Zero-masked variant: the low element is zeroed when U bit 0 is 0. */
754#define _mm_maskz_min_round_sh(U, A, B, R) \
755 ((__m128h)__builtin_ia32_minsh_round_mask( \
756 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
757 (__mmask8)(U), (int)(R)))
759static __inline__ __m128h
761 return (__m128h)__builtin_ia32_maxsh_round_mask(
762 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
767_mm_mask_max_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
768 return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
774_mm_maskz_max_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
775 return (__m128h)__builtin_ia32_maxsh_round_mask(
776 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Scalar (low-element) fp16 maximum with rounding/SAE control R. */
780#define _mm_max_round_sh(A, B, R) \
781 ((__m128h)__builtin_ia32_maxsh_round_mask( \
782 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
783 (__mmask8)-1, (int)(R)))
/* Merge-masked variant: the low element falls back to W when U bit 0 is 0. */
785#define _mm_mask_max_round_sh(W, U, A, B, R) \
786 ((__m128h)__builtin_ia32_maxsh_round_mask( \
787 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
788 (__mmask8)(U), (int)(R)))
/* Zero-masked variant: the low element is zeroed when U bit 0 is 0. */
790#define _mm_maskz_max_round_sh(U, A, B, R) \
791 ((__m128h)__builtin_ia32_maxsh_round_mask( \
792 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
793 (__mmask8)(U), (int)(R)))
/* Packed fp16 compare: predicate P, rounding/SAE control R; yields a 32-bit
   per-lane result mask (all-ones input mask = unmasked). */
795#define _mm512_cmp_round_ph_mask(A, B, P, R) \
796 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
797 (__v32hf)(__m512h)(B), (int)(P), \
798 (__mmask32)-1, (int)(R)))
/* Masked variant: lanes with a clear bit in U produce 0 in the result mask. */
800#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
801 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
802 (__v32hf)(__m512h)(B), (int)(P), \
803 (__mmask32)(U), (int)(R)))
/* Convenience wrappers using the current rounding direction. */
805#define _mm512_cmp_ph_mask(A, B, P) \
806 _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
808#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
809 _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
/* Scalar (low-element) fp16 compare with predicate P and rounding/SAE
   control R; yields an 8-bit result mask. */
811#define _mm_cmp_round_sh_mask(X, Y, P, R) \
812 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
813 (__v8hf)(__m128h)(Y), (int)(P), \
814 (__mmask8)-1, (int)(R)))
/* Masked variant: the comparison is gated by input mask M. */
816#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
817 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
818 (__v8hf)(__m128h)(Y), (int)(P), \
819 (__mmask8)(M), (int)(R)))
/* Same compares using the current rounding direction. */
821#define _mm_cmp_sh_mask(X, Y, P) \
822 ((__mmask8)__builtin_ia32_cmpsh_mask( \
823 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
824 _MM_FROUND_CUR_DIRECTION))
826#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
827 ((__mmask8)__builtin_ia32_cmpsh_mask( \
828 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
829 _MM_FROUND_CUR_DIRECTION))
832 struct __mm_load_sh_struct {
835 _Float16 __u = ((
const struct __mm_load_sh_struct *)__dp)->__u;
836 return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
840_mm_mask_load_sh(__m128h __W,
__mmask8 __U,
const void *__A) {
841 __m128h src = (__v8hf)__builtin_shufflevector(
842 (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
844 return (__m128h)__builtin_ia32_loadsh128_mask((
const __v8hf *)__A, src, __U & 1);
848_mm_maskz_load_sh(
__mmask8 __U,
const void *__A) {
849 return (__m128h)__builtin_ia32_loadsh128_mask(
850 (
const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
854_mm512_load_ph(
void const *
__p) {
855 return *(
const __m512h *)
__p;
859_mm256_load_ph(
void const *
__p) {
860 return *(
const __m256h *)
__p;
864 return *(
const __m128h *)
__p;
868_mm512_loadu_ph(
void const *
__p) {
872 return ((
const struct __loadu_ph *)
__p)->__v;
876_mm256_loadu_ph(
void const *
__p) {
880 return ((
const struct __loadu_ph *)
__p)->__v;
887 return ((
const struct __loadu_ph *)
__p)->__v;
893 struct __mm_store_sh_struct {
896 ((
struct __mm_store_sh_struct *)__dp)->__u =
__a[0];
902 __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
907 *(__m512h *)
__P = __A;
912 *(__m256h *)
__P = __A;
917 *(__m128h *)
__P = __A;
925 ((
struct __storeu_ph *)
__P)->
__v = __A;
933 ((
struct __storeu_ph *)
__P)->
__v = __A;
941 ((
struct __storeu_ph *)
__P)->
__v = __A;
946_mm_move_sh(__m128h
__a, __m128h
__b) {
952_mm_mask_move_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
953 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
957_mm_maskz_move_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
958 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
964 return (__m128i)(__v8hi){
__a, 0, 0, 0, 0, 0, 0, 0};
973 return (__m512h)__builtin_ia32_rcpph512_mask(
974 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1);
978_mm512_mask_rcp_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
979 return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
984_mm512_maskz_rcp_ph(
__mmask32 __U, __m512h __A) {
985 return (__m512h)__builtin_ia32_rcpph512_mask(
986 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U);
990 return (__m512h)__builtin_ia32_rsqrtph512_mask(
991 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1);
995_mm512_mask_rsqrt_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
996 return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
1001_mm512_maskz_rsqrt_ph(
__mmask32 __U, __m512h __A) {
1002 return (__m512h)__builtin_ia32_rsqrtph512_mask(
1003 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U);
/* Extract the mantissa of each packed fp16 element.  B and C are packed into
   the builtin's single immediate as ((C) << 2) | (B); per Intel's naming, B
   is the normalization interval and C the sign control — confirm against the
   intrinsics guide.  Unmasked form uses an undefined passthrough. */
1006#define _mm512_getmant_ph(A, B, C) \
1007 ((__m512h)__builtin_ia32_getmantph512_mask( \
1008 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1009 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
1010 _MM_FROUND_CUR_DIRECTION))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
1012#define _mm512_mask_getmant_ph(W, U, A, B, C) \
1013 ((__m512h)__builtin_ia32_getmantph512_mask( \
1014 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1015 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
1017#define _mm512_maskz_getmant_ph(U, A, B, C) \
1018 ((__m512h)__builtin_ia32_getmantph512_mask( \
1019 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1020 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
/* As above but with an explicit rounding/SAE control R instead of the
   current rounding direction. */
1022#define _mm512_getmant_round_ph(A, B, C, R) \
1023 ((__m512h)__builtin_ia32_getmantph512_mask( \
1024 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1025 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1027#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
1028 ((__m512h)__builtin_ia32_getmantph512_mask( \
1029 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1030 (__mmask32)(U), (int)(R)))
1032#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
1033 ((__m512h)__builtin_ia32_getmantph512_mask( \
1034 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1035 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1038 return (__m512h)__builtin_ia32_getexpph512_mask(
1039 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1044_mm512_mask_getexp_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1045 return (__m512h)__builtin_ia32_getexpph512_mask(
1050_mm512_maskz_getexp_ph(
__mmask32 __U, __m512h __A) {
1051 return (__m512h)__builtin_ia32_getexpph512_mask(
1052 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
/* Extract the exponent of each packed fp16 element, with rounding/SAE
   control R; unmasked form uses an undefined passthrough. */
1056#define _mm512_getexp_round_ph(A, R) \
1057 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1058 (__v32hf)_mm512_undefined_ph(), \
1059 (__mmask32)-1, (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
1061#define _mm512_mask_getexp_round_ph(W, U, A, R) \
1062 ((__m512h)__builtin_ia32_getexpph512_mask( \
1063 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
1065#define _mm512_maskz_getexp_round_ph(U, A, R) \
1066 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1067 (__v32hf)_mm512_setzero_ph(), \
1068 (__mmask32)(U), (int)(R)))
1072 return (__m512h)__builtin_ia32_scalefph512_mask(
1073 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1078_mm512_mask_scalef_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
1079 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1085_mm512_maskz_scalef_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
1086 return (__m512h)__builtin_ia32_scalefph512_mask(
1087 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
/* Scale packed fp16 elements of A by powers of two taken from B, with
   rounding control R; unmasked form uses an undefined passthrough. */
1091#define _mm512_scalef_round_ph(A, B, R) \
1092 ((__m512h)__builtin_ia32_scalefph512_mask( \
1093 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1094 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
1096#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
1097 ((__m512h)__builtin_ia32_scalefph512_mask( \
1098 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
1099 (__mmask32)(U), (int)(R)))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
1101#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
1102 ((__m512h)__builtin_ia32_scalefph512_mask( \
1103 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1104 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
/* Round packed fp16 elements of A using immediate B; the source doubles as
   the (unused) passthrough since the mask is all-ones. */
1106#define _mm512_roundscale_ph(A, B) \
1107 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1108 (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
1109 _MM_FROUND_CUR_DIRECTION))
/* Merge-masked variant — NOTE the parameter order here: A is the merge
   source, B the mask, and C the operand being rounded. */
1111#define _mm512_mask_roundscale_ph(A, B, C, imm) \
1112 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1113 (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
1114 (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
/* Zero-masked variant: A is the mask, B the operand being rounded. */
1116#define _mm512_maskz_roundscale_ph(A, B, imm) \
1117 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1118 (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1119 (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
/* The *_round_* forms take an explicit rounding/SAE control R instead of
   the current rounding direction; operand ordering matches the above. */
1121#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
1122 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
1123 (__v32hf)(__m512h)(A), \
1124 (__mmask32)(B), (int)(R)))
1126#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
1127 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
1128 (__v32hf)_mm512_setzero_ph(), \
1129 (__mmask32)(A), (int)(R)))
1131#define _mm512_roundscale_round_ph(A, imm, R) \
1132 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
1133 (__v32hf)_mm512_undefined_ph(), \
1134 (__mmask32)-1, (int)(R)))
/* Per-element fp16 reduction transform controlled by immediate imm
   (vreduceph); unmasked form uses an undefined passthrough. */
1136#define _mm512_reduce_ph(A, imm) \
1137 ((__m512h)__builtin_ia32_reduceph512_mask( \
1138 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
1139 (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
/* Merge-masked variant: lanes with a clear bit in U are taken from W. */
1141#define _mm512_mask_reduce_ph(W, U, A, imm) \
1142 ((__m512h)__builtin_ia32_reduceph512_mask( \
1143 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
1144 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
/* Zero-masked variant: lanes with a clear bit in U are zeroed. */
1146#define _mm512_maskz_reduce_ph(U, A, imm) \
1147 ((__m512h)__builtin_ia32_reduceph512_mask( \
1148 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1149 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
/* The *_round_* forms take an explicit rounding/SAE control R. */
1151#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
1152 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1153 (__v32hf)(__m512h)(W), \
1154 (__mmask32)(U), (int)(R)))
1156#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
1157 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1158 (__v32hf)_mm512_setzero_ph(), \
1159 (__mmask32)(U), (int)(R)))
1161#define _mm512_reduce_round_ph(A, imm, R) \
1162 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1163 (__v32hf)_mm512_undefined_ph(), \
1164 (__mmask32)-1, (int)(R)))
1168 return (__m128h)__builtin_ia32_rcpsh_mask(
1169 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1);
1176 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1183 return (__m128h)__builtin_ia32_rcpsh_mask(
1184 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U);
1189 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1190 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1);
1197 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1202_mm_maskz_rsqrt_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1203 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1204 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U);
/* Scalar (low-element) fp16 mantissa extraction.  C and D are packed into the
   builtin's immediate as ((D) << 2) | (C); per Intel's naming, C is the
   normalization interval and D the sign control — confirm against the
   intrinsics guide. */
1207#define _mm_getmant_round_sh(A, B, C, D, R) \
1208 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1209 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1210 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
/* Same, using the current rounding direction. */
1212#define _mm_getmant_sh(A, B, C, D) \
1213 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1214 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1215 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
/* Merge-masked variants: the low element falls back to W when U bit 0 is 0. */
1217#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
1218 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1219 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1220 (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1222#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
1223 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1224 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1225 (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
/* Zero-masked variants: the low element is zeroed when U bit 0 is 0. */
1227#define _mm_maskz_getmant_sh(U, A, B, C, D) \
1228 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1229 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1230 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1232#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
1233 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1234 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1235 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
/* Scalar (low-element) fp16 exponent extraction with rounding/SAE control R;
   unmasked (all-ones mask, zero passthrough). */
1237#define _mm_getexp_round_sh(A, B, R) \
1238 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1239 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1240 (__mmask8)-1, (int)(R)))
1244 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1245 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1250_mm_mask_getexp_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1251 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1252 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (
__mmask8)__U,
/* Merge-masked scalar fp16 exponent extraction: the low element falls back
   to W when U bit 0 is 0. */
1256#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
1257 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1258 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1259 (__mmask8)(U), (int)(R)))
1262_mm_maskz_getexp_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1263 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1264 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
/* Zero-masked scalar fp16 exponent extraction: the low element is zeroed
   when U bit 0 is 0. */
1268#define _mm_maskz_getexp_round_sh(U, A, B, R) \
1269 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1270 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1271 (__mmask8)(U), (int)(R)))
/* Scale the low fp16 element of A by a power of two taken from B, with
   rounding control R; unmasked (all-ones mask, zero passthrough). */
1273#define _mm_scalef_round_sh(A, B, R) \
1274 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1275 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1276 (__mmask8)-1, (int)(R)))
1280 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1281 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1286_mm_mask_scalef_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1287 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
/* Merge-masked scalar fp16 scalef: the low element falls back to W when
   U bit 0 is 0. */
1292#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
1293 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1294 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1295 (__mmask8)(U), (int)(R)))
1298_mm_maskz_scalef_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1299 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1300 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
// Zero-masked scalar FP16 scale with explicit rounding control R.
#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
// Scalar FP16 round-scale macro family (rndscalesh). "imm"/"I" is the
// immediate round control; "R" the SAE/rounding mode; forms without R use
// _MM_FROUND_CUR_DIRECTION. Unmasked forms pass mask -1 with a zero vector
// as the (unused) passthrough operand; mask forms merge with W; maskz forms
// zero masked-off elements.
#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), (int)(R)))
// Scalar FP16 reduce macro family (reducesh). "C" is the immediate control;
// "R" the SAE/rounding mode; forms without R use _MM_FROUND_CUR_DIRECTION.
#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), (int)(R)))
// 512-bit packed FP16 square root with rounding control; masked variants
// are built from the unmasked macro via a selectph blend.
#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))
1383 return (__m512h)__builtin_elementwise_sqrt((__v32hf)__A);
1387_mm512_mask_sqrt_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1388 return (__m512h)__builtin_ia32_selectph_512(
1389 (
__mmask32)(__U), (__v32hf)_mm512_sqrt_ph(__A), (__v32hf)(__m512h)(__W));
1393_mm512_maskz_sqrt_ph(
__mmask32 __U, __m512h __A) {
1394 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)(__U),
1395 (__v32hf)_mm512_sqrt_ph(__A),
1396 (__v32hf)_mm512_setzero_ph());
// Scalar FP16 square root (sqrtsh) with rounding control R; unmasked,
// merge-masked (W), and zero-masked variants.
#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
1416 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1417 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1425 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1426 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1433 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1434 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
// 512-bit packed FP16 class test (fpclassph): returns a 32-bit element mask;
// "imm" selects the categories tested.
#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)-1))
1446#define _mm_fpclass_sh_mask(A, imm) \
1447 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
1450#define _mm_mask_fpclass_sh_mask(U, A, imm) \
1451 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
// Convert 8 doubles to 8 FP16 values (vcvtpd2ph) with rounding control R.
#define _mm512_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1467 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1468 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1473_mm512_mask_cvtpd_ph(__m128h __W,
__mmask8 __U, __m512d __A) {
1474 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1479_mm512_maskz_cvtpd_ph(
__mmask8 __U, __m512d __A) {
1480 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1481 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
// Convert 8 FP16 values to 8 doubles (vcvtph2pd) with rounding control R.
#define _mm512_cvt_roundph_pd(A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1498 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1504_mm512_mask_cvtph_pd(__m512d __W,
__mmask8 __U, __m128h __A) {
1505 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1510_mm512_maskz_cvtph_pd(
__mmask8 __U, __m128h __A) {
1511 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
// Convert the low FP16 element of B to single precision (vcvtsh2ss),
// upper elements from A; R is the rounding/SAE control.
#define _mm_cvt_roundsh_ss(A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
                                               (__v4sf)_mm_undefined_ps(), \
                                               (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)(U), (int)(R)))
1532 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1541 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1549 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
// Convert the low single-precision element of B to FP16 (vcvtss2sh),
// upper elements from A; R is the rounding control.
#define _mm_cvt_roundss_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
                                                (__v8hf)_mm_undefined_ph(), \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
                                                (__v8hf)_mm_setzero_ph(), \
                                                (__mmask8)(U), (int)(R)))
1570 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1571 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1579 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1580 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (
__mmask8)__U,
1587 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1588 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
// Convert the low double element of B to FP16 (vcvtsd2sh), upper elements
// from A; R is the rounding control.
#define _mm_cvt_roundsd_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
                                                (__v8hf)_mm_undefined_ph(), \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
                                                (__v8hf)_mm_setzero_ph(), \
                                                (__mmask8)(U), (int)(R)))
1608 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1609 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1617 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1618 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (
__mmask8)__U,
1623_mm_maskz_cvtsd_sh(
__mmask8 __U, __m128h __A, __m128d __B) {
1624 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1625 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
// Convert the low FP16 element of B to double (vcvtsh2sd), upper element
// from A; R is the rounding/SAE control.
#define _mm_cvt_roundsh_sd(A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
                                                (__v2df)_mm_undefined_pd(), \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)(U), (int)(R)))
1645 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1654 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1655 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (
__mmask8)__U,
1660_mm_maskz_cvtsh_sd(
__mmask8 __U, __m128d __A, __m128h __B) {
1661 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
// Convert 32 FP16 values to signed 16-bit integers (vcvtph2w), rounding R.
#define _mm512_cvt_roundph_epi16(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
                                            (__v32hi)_mm512_undefined_epi32(), \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
                                            (__v32hi)_mm512_setzero_epi32(), \
                                            (__mmask32)(U), (int)(R)))
1681_mm512_cvtph_epi16(__m512h __A) {
1682 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1688_mm512_mask_cvtph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1689 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1694_mm512_maskz_cvtph_epi16(
__mmask32 __U, __m512h __A) {
1695 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1700#define _mm512_cvtt_roundph_epi16(A, R) \
1701 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1702 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
// Truncating FP16 -> signed 16-bit conversion (vcvttph2w), masked variants.
#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
                                             (__v32hi)_mm512_setzero_epi32(), \
                                             (__mmask32)(U), (int)(R)))
1715_mm512_cvttph_epi16(__m512h __A) {
1716 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1722_mm512_mask_cvttph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1723 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1728_mm512_maskz_cvttph_epi16(
__mmask32 __U, __m512h __A) {
1729 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
// Convert 32 signed 16-bit integers to FP16 (vcvtw2ph), rounding R.
#define _mm512_cvt_roundepi16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1748_mm512_cvtepi16_ph(__m512i __A) {
1749 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1750 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1755_mm512_mask_cvtepi16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1756 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1761_mm512_maskz_cvtepi16_ph(
__mmask32 __U, __m512i __A) {
1762 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1763 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1767#define _mm512_cvt_roundph_epu16(A, R) \
1768 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1769 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
// FP16 -> unsigned 16-bit conversion (vcvtph2uw), masked variants.
#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
                                             (__v32hu)_mm512_setzero_epi32(), \
                                             (__mmask32)(U), (int)(R)))
1782_mm512_cvtph_epu16(__m512h __A) {
1783 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1789_mm512_mask_cvtph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1790 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1795_mm512_maskz_cvtph_epu16(
__mmask32 __U, __m512h __A) {
1796 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1801#define _mm512_cvtt_roundph_epu16(A, R) \
1802 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1803 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
// Truncating FP16 -> unsigned 16-bit conversion (vcvttph2uw), masked variants.
#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
                                              (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
                                              (__v32hu)_mm512_setzero_epi32(), \
                                              (__mmask32)(U), (int)(R)))
1816_mm512_cvttph_epu16(__m512h __A) {
1817 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1823_mm512_mask_cvttph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1824 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1829_mm512_maskz_cvttph_epu16(
__mmask32 __U, __m512h __A) {
1830 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
// Convert 32 unsigned 16-bit integers to FP16 (vcvtuw2ph), rounding R.
#define _mm512_cvt_roundepu16_ph(A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
                                             (__v32hf)_mm512_undefined_ph(), \
                                             (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1849_mm512_cvtepu16_ph(__m512i __A) {
1850 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1851 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1856_mm512_mask_cvtepu16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1857 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1862_mm512_maskz_cvtepu16_ph(
__mmask32 __U, __m512i __A) {
1863 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1864 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1868#define _mm512_cvt_roundph_epi32(A, R) \
1869 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1870 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
// 16 x FP16 -> signed 32-bit conversion (vcvtph2dq), masked variants.
#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
                                             (__v16si)_mm512_setzero_epi32(), \
                                             (__mmask16)(U), (int)(R)))
1883_mm512_cvtph_epi32(__m256h __A) {
1884 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1890_mm512_mask_cvtph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
1891 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1896_mm512_maskz_cvtph_epi32(
__mmask16 __U, __m256h __A) {
1897 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1902#define _mm512_cvt_roundph_epu32(A, R) \
1903 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1904 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
// 16 x FP16 -> unsigned 32-bit conversion (vcvtph2udq), masked variants.
#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
                                              (__v16su)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))
1917_mm512_cvtph_epu32(__m256h __A) {
1918 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1924_mm512_mask_cvtph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
1925 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1930_mm512_maskz_cvtph_epu32(
__mmask16 __U, __m256h __A) {
1931 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
// Convert 16 signed 32-bit integers to FP16 (vcvtdq2ph), rounding R.
#define _mm512_cvt_roundepi32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
                                             (__v16hf)_mm256_undefined_ph(), \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1950_mm512_cvtepi32_ph(__m512i __A) {
1951 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1952 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1957_mm512_mask_cvtepi32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1958 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1963_mm512_maskz_cvtepi32_ph(
__mmask16 __U, __m512i __A) {
1964 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1965 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
// Convert 16 unsigned 32-bit integers to FP16 (vcvtudq2ph), rounding R.
#define _mm512_cvt_roundepu32_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
                                              (__v16hf)_mm256_undefined_ph(), \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1983_mm512_cvtepu32_ph(__m512i __A) {
1984 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1985 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1990_mm512_mask_cvtepu32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1991 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1996_mm512_maskz_cvtepu32_ph(
__mmask16 __U, __m512i __A) {
1997 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1998 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
2002#define _mm512_cvtt_roundph_epi32(A, R) \
2003 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2004 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
// Truncating FP16 -> signed 32-bit conversion (vcvttph2dq), masked variants.
#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
                                              (__v16si)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))
2017_mm512_cvttph_epi32(__m256h __A) {
2018 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2024_mm512_mask_cvttph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
2025 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2030_mm512_maskz_cvttph_epi32(
__mmask16 __U, __m256h __A) {
2031 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2036#define _mm512_cvtt_roundph_epu32(A, R) \
2037 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2038 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
// Truncating FP16 -> unsigned 32-bit conversion (vcvttph2udq), merge-masked.
#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
                                               (__mmask16)(U), (int)(R)))
2045#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2046 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2047 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2051_mm512_cvttph_epu32(__m256h __A) {
2052 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2058_mm512_mask_cvttph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
2059 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2064_mm512_maskz_cvttph_epu32(
__mmask16 __U, __m256h __A) {
2065 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
// Convert 8 signed 64-bit integers to FP16 (vcvtqq2ph), rounding R.
#define _mm512_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2083_mm512_cvtepi64_ph(__m512i __A) {
2084 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2085 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2090_mm512_mask_cvtepi64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2091 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2096_mm512_maskz_cvtepi64_ph(
__mmask8 __U, __m512i __A) {
2097 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2098 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
// Convert 8 FP16 values to signed 64-bit integers (vcvtph2qq), rounding R.
#define _mm512_cvt_roundph_epi64(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
                                             (__v8di)_mm512_undefined_epi32(), \
                                             (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2116_mm512_cvtph_epi64(__m128h __A) {
2117 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2123_mm512_mask_cvtph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2124 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2129_mm512_maskz_cvtph_epi64(
__mmask8 __U, __m128h __A) {
2130 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
// Convert 8 unsigned 64-bit integers to FP16 (vcvtuqq2ph), rounding R.
#define _mm512_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2148_mm512_cvtepu64_ph(__m512i __A) {
2149 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2150 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2155_mm512_mask_cvtepu64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2156 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2161_mm512_maskz_cvtepu64_ph(
__mmask8 __U, __m512i __A) {
2162 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2163 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2167#define _mm512_cvt_roundph_epu64(A, R) \
2168 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2169 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
// FP16 -> unsigned 64-bit conversion (vcvtph2uqq), masked variants.
#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2181_mm512_cvtph_epu64(__m128h __A) {
2182 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2188_mm512_mask_cvtph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2189 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2194_mm512_maskz_cvtph_epu64(
__mmask8 __U, __m128h __A) {
2195 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2200#define _mm512_cvtt_roundph_epi64(A, R) \
2201 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2202 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
// Truncating FP16 -> signed 64-bit conversion (vcvttph2qq), masked variants.
#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2214_mm512_cvttph_epi64(__m128h __A) {
2215 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2221_mm512_mask_cvttph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2222 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2227_mm512_maskz_cvttph_epi64(
__mmask8 __U, __m128h __A) {
2228 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2233#define _mm512_cvtt_roundph_epu64(A, R) \
2234 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2235 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
// Truncating FP16 -> unsigned 64-bit conversion (vcvttph2uqq), masked variants.
#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2247_mm512_cvttph_epu64(__m128h __A) {
2248 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2254_mm512_mask_cvttph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2255 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2260_mm512_maskz_cvttph_epu64(
__mmask8 __U, __m128h __A) {
2261 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
// Scalar FP16 -> 32-bit integer conversions with rounding control R.
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2277_mm_cvtsh_u32(__m128h __A) {
2278 return (
unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
// Scalar FP16 -> signed 64-bit integer conversion, rounding control R.
#define _mm_cvt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2287 return (
long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
// Scalar FP16 -> unsigned 64-bit integer conversion, rounding control R.
#define _mm_cvt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2295_mm_cvtsh_u64(__m128h __A) {
2296 return (
unsigned long long)__builtin_ia32_vcvtsh2usi64(
// Unsigned 32-bit integer -> scalar FP16 conversion, rounding control R.
#define _mm_cvt_roundu32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2305_mm_cvtu32_sh(__m128h __A,
unsigned int __B) {
2311#define _mm_cvt_roundu64_sh(A, B, R) \
2312 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2316_mm_cvtu64_sh(__m128h __A,
unsigned long long __B) {
// Signed integer -> scalar FP16 conversions, rounding control R.
#define _mm_cvt_roundi32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))

#define _mm_cvt_roundi64_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
// Truncating scalar FP16 -> signed 32-bit integer conversion.
#define _mm_cvtt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2346 return (
int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
// Truncating scalar FP16 -> signed 64-bit integer conversion.
#define _mm_cvtt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2355 return (
long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
// Truncating scalar FP16 -> unsigned 32-bit integer conversion.
#define _mm_cvtt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2364_mm_cvttsh_u32(__m128h __A) {
2365 return (
unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
// Truncating scalar FP16 -> unsigned 64-bit integer conversion.
#define _mm_cvtt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2374_mm_cvttsh_u64(__m128h __A) {
2375 return (
unsigned long long)__builtin_ia32_vcvttsh2usi64(
/* 256-bit FP16 -> 512-bit FP32 widening conversions with rounding control R.
   mask: blend with W under U; maskz: zero the masked-off lanes. */
#define _mm512_cvtx_roundph_ps(A, R)                                           \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
                                             (__v16sf)_mm512_undefined_ps(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2394 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2400_mm512_mask_cvtxph_ps(__m512 __W,
__mmask16 __U, __m256h __A) {
2401 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2406_mm512_maskz_cvtxph_ps(
__mmask16 __U, __m256h __A) {
2407 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
/* 512-bit FP32 -> 256-bit FP16 narrowing conversions with rounding control R. */
#define _mm512_cvtx_roundps_ph(A, R)                                           \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2426 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2427 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
2432_mm512_mask_cvtxps_ph(__m256h __W,
__mmask16 __U, __m512 __A) {
2433 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2438_mm512_maskz_cvtxps_ph(
__mmask16 __U, __m512 __A) {
2439 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2440 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
/* 512-bit packed FP16 FMA family with explicit rounding control (R).
   All variants lower onto the vfmaddph builtins; fmsub / fnmadd / fnmsub are
   expressed by negating the appropriate multiplicand or addend operand. */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2507 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2513_mm512_mask_fmadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2514 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2520_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2521 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2527_mm512_maskz_fmadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2528 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2536 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2542_mm512_mask_fmsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2543 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2549_mm512_maskz_fmsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2550 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2551 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
2558 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2564_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2565 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2571_mm512_maskz_fnmadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2572 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2580 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2586_mm512_maskz_fnmsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2587 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2588 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
/* 512-bit packed FP16 fused multiply-add/sub alternating family with rounding
   control (R).  fmsubadd reuses the fmaddsub builtin with the addend negated. */
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2628_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2629 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2630 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)-1,
2635_mm512_mask_fmaddsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2636 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2637 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
2642_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2643 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2644 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
2649_mm512_maskz_fmaddsub_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2650 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2651 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
2656_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2657 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2658 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)-1,
2663_mm512_mask_fmsubadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2664 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2665 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
2670_mm512_maskz_fmsubadd_ph(
__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2671 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2672 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (
__mmask32)__U,
/* mask3 fmsub needs the dedicated vfmsub builtin so C (the pass-through
   operand) is not negated before blending. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))
2682_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2683 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
/* mask3 fmsubadd: dedicated builtin keeps C unnegated for the mask blend. */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))
2694_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2695 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2696 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (
__mmask32)__U,
/* mask fnmadd: negate B (not A) so A survives as the mask pass-through. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2706_mm512_mask_fnmadd_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2707 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
/* fnmsub masked variants: the sign is carried by the non-pass-through operands. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2723_mm512_mask_fnmsub_ph(__m512h __A,
__mmask32 __U, __m512h __B, __m512h __C) {
2724 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2730_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C,
__mmask32 __U) {
2731 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2739 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2747 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
/* Scalar FP16 FMA with rounding control; upper 7 lanes pass through from the
   first vector operand. */
#define _mm_fmadd_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
      (__mmask8)(U), (int)(R)))
2762_mm_maskz_fmadd_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2763 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
/* Zero-masking scalar FP16 FMA with rounding control. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
2774_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2775 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)
__Y,
/* mask3 scalar FP16 FMA: Y is the pass-through when the mask bit is clear. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2788 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2797 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
/* Scalar FP16 fmsub with rounding control: fmadd builtin with negated addend. */
#define _mm_fmsub_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))
2813_mm_maskz_fmsub_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2814 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
/* Zero-masking scalar FP16 fmsub with rounding control.
   Fix: R was cast as `(int)R` without parentheses — a compound rounding
   expression (e.g. `x ? a : b`) would bind to the cast incorrectly.  All
   sibling macros in this family use `(int)(R)`; made consistent. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2825_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2826 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)
__Y,
/* mask3 scalar fmsub: dedicated vfmsub builtin so Y stays unnegated for the blend. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2839 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2844_mm_mask_fnmadd_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
/* Scalar FP16 fnmadd with rounding control: negate one multiplicand. */
#define _mm_fnmadd_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmadd_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))
2860_mm_maskz_fnmadd_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2861 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
/* Zero-masking scalar FP16 fnmadd with rounding control. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2872_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2873 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)
__Y,
/* mask3 scalar fnmadd: negate X so Y (the pass-through) is untouched. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
2886 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2891_mm_mask_fnmsub_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
/* Scalar FP16 fnmsub with rounding control: negate multiplicand and addend. */
#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
      (__mmask8)(U), (int)(R)))
2907_mm_maskz_fnmsub_sh(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2908 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
/* Zero-masking scalar FP16 fnmsub with rounding control. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)(U), (int)(R)))
2919_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h
__Y,
__mmask8 __U) {
2920 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)
__Y,
/* mask3 scalar fnmsub: vfmsub builtin plus negated X keeps Y as pass-through. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
2933 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2939_mm_mask_fcmadd_sch(__m128h __A,
__mmask8 __U, __m128h __B, __m128h __C) {
2940 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2945_mm_maskz_fcmadd_sch(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2946 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2952_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C,
__mmask8 __U) {
2953 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
/* Scalar complex FP16 conjugate multiply-add with rounding control.  The
   operands are reinterpreted as __v4sf because each complex FP16 pair occupies
   one 32-bit lane. */
#define _mm_fcmadd_round_sch(A, B, C, R)                                       \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
2980 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2986_mm_mask_fmadd_sch(__m128h __A,
__mmask8 __U, __m128h __B, __m128h __C) {
2987 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2992_mm_maskz_fmadd_sch(
__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2993 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2999_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C,
__mmask8 __U) {
3000 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
/* Scalar complex FP16 multiply-add (non-conjugating) with rounding control. */
#define _mm_fmadd_round_sch(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
3026 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3027 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (
__mmask8)-1,
3032_mm_mask_fcmul_sch(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
3033 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3039_mm_maskz_fcmul_sch(
__mmask8 __U, __m128h __A, __m128h __B) {
3040 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3041 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (
__mmask8)__U,
/* Scalar complex FP16 conjugate multiply with rounding control. */
#define _mm_fcmul_round_sch(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3062 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3063 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (
__mmask8)-1,
3071 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3077_mm_maskz_fmul_sch(
__mmask8 __U, __m128h __A, __m128h __B) {
3078 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3079 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (
__mmask8)__U,
/* Scalar complex FP16 multiply (non-conjugating) with rounding control. */
#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3100 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3101 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (
__mmask16)-1,
3106_mm512_mask_fcmul_pch(__m512h __W,
__mmask16 __U, __m512h __A, __m512h __B) {
3107 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3113_mm512_maskz_fcmul_pch(
__mmask16 __U, __m512h __A, __m512h __B) {
3114 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3115 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (
__mmask16)__U,
/* 512-bit packed complex FP16 conjugate multiply with rounding control. */
#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3136 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3137 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (
__mmask16)-1,
3142_mm512_mask_fmul_pch(__m512h __W,
__mmask16 __U, __m512h __A, __m512h __B) {
3143 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3149_mm512_maskz_fmul_pch(
__mmask16 __U, __m512h __A, __m512h __B) {
3150 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3151 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (
__mmask16)__U,
/* 512-bit packed complex FP16 multiply (non-conjugating) with rounding control. */
#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3173 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3174 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)-1,
3179_mm512_mask_fcmadd_pch(__m512h __A,
__mmask16 __U, __m512h __B, __m512h __C) {
3180 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3181 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
3186_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C,
__mmask16 __U) {
3187 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3188 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
3193_mm512_maskz_fcmadd_pch(
__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3194 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3195 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
/* 512-bit packed complex FP16 conjugate multiply-add with rounding control.
   The unmasked form uses the mask3 builtin with an all-ones mask. */
#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3222 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3228_mm512_mask_fmadd_pch(__m512h __A,
__mmask16 __U, __m512h __B, __m512h __C) {
3229 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3235_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C,
__mmask16 __U) {
3236 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3237 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
3242_mm512_maskz_fmadd_pch(
__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3243 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3244 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (
__mmask16)__U,
/* 512-bit packed complex FP16 multiply-add with rounding control. */
#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3269_mm512_reduce_add_ph(__m512h __W) {
3270 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3274_mm512_reduce_mul_ph(__m512h __W) {
3275 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3279_mm512_reduce_max_ph(__m512h __V) {
3280 return __builtin_ia32_reduce_fmax_ph512(__V);
3284_mm512_reduce_min_ph(__m512h __V) {
3285 return __builtin_ia32_reduce_fmin_ph512(__V);
3289_mm512_mask_blend_ph(
__mmask32 __U, __m512h __A, __m512h __W) {
3290 return (__m512h)__builtin_ia32_selectph_512((
__mmask32)__U, (__v32hf)__W,
3295_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3296 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3301_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3302 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
/* Spelling aliases: the "mul"/"cmul" names forward to the f(c)mul complex
   multiply intrinsics defined above. */
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
/* Scrub the header-private attribute macros so they do not leak to users. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
static __inline__ vector float vector float __b
static __inline__ uint32_t volatile uint32_t * __p
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
#define __DEFAULT_FN_ATTRS512_CONSTEXPR
#define __DEFAULT_FN_ATTRS512
#define _mm512_setzero_epi32
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_and_epi32(__m512i __a, __m512i __b)
static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_pd(void)
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_xor_epi32(__m512i __a, __m512i __b)
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_ps(float __w)
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void)
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_epi32(int __s)
#define _MM_FROUND_CUR_DIRECTION
static __inline__ void int __a
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
__inline unsigned int unsigned int unsigned int * __P
__inline unsigned int unsigned int __Y
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.