10#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
15#ifndef __AVX512FP16INTRIN_H
16#define __AVX512FP16INTRIN_H
24#define __DEFAULT_FN_ATTRS512 \
25 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
26 __min_vector_width__(512)))
27#define __DEFAULT_FN_ATTRS256 \
28 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
29 __min_vector_width__(256)))
30#define __DEFAULT_FN_ATTRS128 \
31 __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
32 __min_vector_width__(128)))
34#if defined(__cplusplus) && (__cplusplus >= 201103L)
35#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
36#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
37#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
39#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
40#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
41#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
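/* All intrinsics in this file are force-inlined and carry
   __target__("avx512fp16"), so they can only be used when AVX512-FP16 code
   generation is enabled; the *_CONSTEXPR variants additionally allow constant
   evaluation under C++11 and later. */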
static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9, __h8, __h7, __h6, __h5,
                            __h4, __h3, __h2, __h1};
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_setr_ph(_Float16 e0, _Float16 e1, _Float16 e2, _Float16 e3,
               _Float16 e4, _Float16 e5, _Float16 e6, _Float16 e7,
               _Float16 e8, _Float16 e9, _Float16 e10, _Float16 e11,
               _Float16 e12, _Float16 e13, _Float16 e14, _Float16 e15,
               _Float16 e16, _Float16 e17, _Float16 e18, _Float16 e19,
               _Float16 e20, _Float16 e21, _Float16 e22, _Float16 e23,
               _Float16 e24, _Float16 e25, _Float16 e26, _Float16 e27,
               _Float16 e28, _Float16 e29, _Float16 e30, _Float16 e31) {
  return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
                       e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
                       e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex __h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
}
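/* Note on element order: _mm512_set_ph assigns __h1 to the highest element
   (element 31) and __h32 to element 0, while _mm512_setr_ph takes its
   arguments in memory order (e0 -> element 0). Illustrative use only:
     __m512h ones = _mm512_set1_ph((_Float16)1.0);  // broadcast to all 32 lanes
*/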
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                                 27, 28, 29, 30, 31);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}
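/* The castphN_phM widening casts above leave the upper elements undefined
   (they come from __builtin_nondeterministic_value), while the zextph
   variants explicitly zero them. */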
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
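/* P is a _CMP_* predicate and R is a rounding/SAE control (e.g.
   _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC); both must be integer
   constant expressions. */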
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}
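/* The _mm_comi*_sh forms above use the signaling (ordered _OS / unordered _US)
   predicates and the _mm_ucomi*_sh forms the quiet (_OQ / _UQ) predicates,
   mirroring the COMISS/UCOMISS split for float. */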
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
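/* Illustrative use of the masked forms (not part of the header):
     __mmask32 __m = 0x0000FFFF;
     __m512h __r1 = _mm512_mask_add_ph(__src, __m, __a, __b);  // masked-off lanes keep __src
     __m512h __r2 = _mm512_maskz_add_ph(__m, __a, __b);        // masked-off lanes become 0
*/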
392#define _mm512_add_round_ph(A, B, R) \
393 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
394 (__v32hf)(__m512h)(B), (int)(R)))
396#define _mm512_mask_add_round_ph(W, U, A, B, R) \
397 ((__m512h)__builtin_ia32_selectph_512( \
398 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
399 (__v32hf)(__m512h)(W)))
401#define _mm512_maskz_add_round_ph(U, A, B, R) \
402 ((__m512h)__builtin_ia32_selectph_512( \
403 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
404 (__v32hf)_mm512_setzero_ph()))
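/* R in the *_round_* forms must be a compile-time rounding constant, e.g.
   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, or _MM_FROUND_CUR_DIRECTION
   to use the current MXCSR rounding mode. */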
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
424#define _mm512_sub_round_ph(A, B, R) \
425 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
426 (__v32hf)(__m512h)(B), (int)(R)))
428#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
429 ((__m512h)__builtin_ia32_selectph_512( \
430 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
431 (__v32hf)(__m512h)(W)))
433#define _mm512_maskz_sub_round_ph(U, A, B, R) \
434 ((__m512h)__builtin_ia32_selectph_512( \
435 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
436 (__v32hf)_mm512_setzero_ph()))
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
456#define _mm512_mul_round_ph(A, B, R) \
457 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
458 (__v32hf)(__m512h)(B), (int)(R)))
460#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
461 ((__m512h)__builtin_ia32_selectph_512( \
462 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
463 (__v32hf)(__m512h)(W)))
465#define _mm512_maskz_mul_round_ph(U, A, B, R) \
466 ((__m512h)__builtin_ia32_selectph_512( \
467 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
468 (__v32hf)_mm512_setzero_ph()))
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
488#define _mm512_div_round_ph(A, B, R) \
489 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
490 (__v32hf)(__m512h)(B), (int)(R)))
492#define _mm512_mask_div_round_ph(W, U, A, B, R) \
493 ((__m512h)__builtin_ia32_selectph_512( \
494 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
495 (__v32hf)(__m512h)(W)))
497#define _mm512_maskz_div_round_ph(U, A, B, R) \
498 ((__m512h)__builtin_ia32_selectph_512( \
499 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
500 (__v32hf)_mm512_setzero_ph()))
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
521#define _mm512_min_round_ph(A, B, R) \
522 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
523 (__v32hf)(__m512h)(B), (int)(R)))
525#define _mm512_mask_min_round_ph(W, U, A, B, R) \
526 ((__m512h)__builtin_ia32_selectph_512( \
527 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
528 (__v32hf)(__m512h)(W)))
530#define _mm512_maskz_min_round_ph(U, A, B, R) \
531 ((__m512h)__builtin_ia32_selectph_512( \
532 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
533 (__v32hf)_mm512_setzero_ph()))
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}
554#define _mm512_max_round_ph(A, B, R) \
555 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
556 (__v32hf)(__m512h)(B), (int)(R)))
558#define _mm512_mask_max_round_ph(W, U, A, B, R) \
559 ((__m512h)__builtin_ia32_selectph_512( \
560 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
561 (__v32hf)(__m512h)(W)))
563#define _mm512_maskz_max_round_ph(U, A, B, R) \
564 ((__m512h)__builtin_ia32_selectph_512( \
565 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
566 (__v32hf)_mm512_setzero_ph()))
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}
612#define _mm_add_round_sh(A, B, R) \
613 ((__m128h)__builtin_ia32_addsh_round_mask( \
614 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
615 (__mmask8)-1, (int)(R)))
617#define _mm_mask_add_round_sh(W, U, A, B, R) \
618 ((__m128h)__builtin_ia32_addsh_round_mask( \
619 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
620 (__mmask8)(U), (int)(R)))
622#define _mm_maskz_add_round_sh(U, A, B, R) \
623 ((__m128h)__builtin_ia32_addsh_round_mask( \
624 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
625 (__mmask8)(U), (int)(R)))
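/* The scalar *_sh operations act on element 0 only; elements 1..7 of the
   result are taken from the first vector operand. */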
637 __A = _mm_sub_sh(__A, __B);
638 return __builtin_ia32_selectsh_128(__U, __A, __W);
644 __A = _mm_sub_sh(__A, __B);
645 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
648#define _mm_sub_round_sh(A, B, R) \
649 ((__m128h)__builtin_ia32_subsh_round_mask( \
650 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
651 (__mmask8)-1, (int)(R)))
653#define _mm_mask_sub_round_sh(W, U, A, B, R) \
654 ((__m128h)__builtin_ia32_subsh_round_mask( \
655 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
656 (__mmask8)(U), (int)(R)))
658#define _mm_maskz_sub_round_sh(U, A, B, R) \
659 ((__m128h)__builtin_ia32_subsh_round_mask( \
660 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
661 (__mmask8)(U), (int)(R)))
673 __A = _mm_mul_sh(__A, __B);
674 return __builtin_ia32_selectsh_128(__U, __A, __W);
680 __A = _mm_mul_sh(__A, __B);
681 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
684#define _mm_mul_round_sh(A, B, R) \
685 ((__m128h)__builtin_ia32_mulsh_round_mask( \
686 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
687 (__mmask8)-1, (int)(R)))
689#define _mm_mask_mul_round_sh(W, U, A, B, R) \
690 ((__m128h)__builtin_ia32_mulsh_round_mask( \
691 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
692 (__mmask8)(U), (int)(R)))
694#define _mm_maskz_mul_round_sh(U, A, B, R) \
695 ((__m128h)__builtin_ia32_mulsh_round_mask( \
696 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
697 (__mmask8)(U), (int)(R)))
709 __A = _mm_div_sh(__A, __B);
710 return __builtin_ia32_selectsh_128(__U, __A, __W);
716 __A = _mm_div_sh(__A, __B);
717 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
720#define _mm_div_round_sh(A, B, R) \
721 ((__m128h)__builtin_ia32_divsh_round_mask( \
722 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
723 (__mmask8)-1, (int)(R)))
725#define _mm_mask_div_round_sh(W, U, A, B, R) \
726 ((__m128h)__builtin_ia32_divsh_round_mask( \
727 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
728 (__mmask8)(U), (int)(R)))
730#define _mm_maskz_div_round_sh(U, A, B, R) \
731 ((__m128h)__builtin_ia32_divsh_round_mask( \
732 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
733 (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_min_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_min_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
759#define _mm_min_round_sh(A, B, R) \
760 ((__m128h)__builtin_ia32_minsh_round_mask( \
761 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
762 (__mmask8)-1, (int)(R)))
764#define _mm_mask_min_round_sh(W, U, A, B, R) \
765 ((__m128h)__builtin_ia32_minsh_round_mask( \
766 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
767 (__mmask8)(U), (int)(R)))
769#define _mm_maskz_min_round_sh(U, A, B, R) \
770 ((__m128h)__builtin_ia32_minsh_round_mask( \
771 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
772 (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_max_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_max_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
798#define _mm_max_round_sh(A, B, R) \
799 ((__m128h)__builtin_ia32_maxsh_round_mask( \
800 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
801 (__mmask8)-1, (int)(R)))
803#define _mm_mask_max_round_sh(W, U, A, B, R) \
804 ((__m128h)__builtin_ia32_maxsh_round_mask( \
805 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
806 (__mmask8)(U), (int)(R)))
808#define _mm_maskz_max_round_sh(U, A, B, R) \
809 ((__m128h)__builtin_ia32_maxsh_round_mask( \
810 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
811 (__mmask8)(U), (int)(R)))
813#define _mm512_cmp_round_ph_mask(A, B, P, R) \
814 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
815 (__v32hf)(__m512h)(B), (int)(P), \
816 (__mmask32)-1, (int)(R)))
818#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
819 ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
820 (__v32hf)(__m512h)(B), (int)(P), \
821 (__mmask32)(U), (int)(R)))
823#define _mm512_cmp_ph_mask(A, B, P) \
824 _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
826#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
827 _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
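/* Illustrative use of the vector compare (not part of the header):
     __mmask32 __lt = _mm512_cmp_ph_mask(__a, __b, _CMP_LT_OQ);  // one bit per lane
*/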
829#define _mm_cmp_round_sh_mask(X, Y, P, R) \
830 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
831 (__v8hf)(__m128h)(Y), (int)(P), \
832 (__mmask8)-1, (int)(R)))
834#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
835 ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
836 (__v8hf)(__m128h)(Y), (int)(P), \
837 (__mmask8)(M), (int)(R)))
839#define _mm_cmp_sh_mask(X, Y, P) \
840 ((__mmask8)__builtin_ia32_cmpsh_mask( \
841 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
842 _MM_FROUND_CUR_DIRECTION))
844#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
845 ((__mmask8)__builtin_ia32_cmpsh_mask( \
846 (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
847 _MM_FROUND_CUR_DIRECTION))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
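/* The load_ph/store_ph forms dereference a typed vector pointer and therefore
   require a suitably aligned address; the loadu_ph/storeu_ph forms go through
   a packed struct and accept any alignment. */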
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}
1027#define _mm512_getmant_ph(A, B, C) \
1028 ((__m512h)__builtin_ia32_getmantph512_mask( \
1029 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1030 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
1031 _MM_FROUND_CUR_DIRECTION))
1033#define _mm512_mask_getmant_ph(W, U, A, B, C) \
1034 ((__m512h)__builtin_ia32_getmantph512_mask( \
1035 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1036 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1038#define _mm512_maskz_getmant_ph(U, A, B, C) \
1039 ((__m512h)__builtin_ia32_getmantph512_mask( \
1040 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1041 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1043#define _mm512_getmant_round_ph(A, B, C, R) \
1044 ((__m512h)__builtin_ia32_getmantph512_mask( \
1045 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1046 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1048#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
1049 ((__m512h)__builtin_ia32_getmantph512_mask( \
1050 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
1051 (__mmask32)(U), (int)(R)))
1053#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
1054 ((__m512h)__builtin_ia32_getmantph512_mask( \
1055 (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
1056 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
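/* In the getmant forms, B selects the normalization interval (_MM_MANT_NORM_*)
   and C the sign control (_MM_MANT_SIGN_*); they are packed into one immediate
   as ((C) << 2) | (B). */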
1059 return (__m512h)__builtin_ia32_getexpph512_mask(
1060 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1065_mm512_mask_getexp_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1066 return (__m512h)__builtin_ia32_getexpph512_mask(
1071_mm512_maskz_getexp_ph(
__mmask32 __U, __m512h __A) {
1072 return (__m512h)__builtin_ia32_getexpph512_mask(
1073 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1077#define _mm512_getexp_round_ph(A, R) \
1078 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1079 (__v32hf)_mm512_undefined_ph(), \
1080 (__mmask32)-1, (int)(R)))
1082#define _mm512_mask_getexp_round_ph(W, U, A, R) \
1083 ((__m512h)__builtin_ia32_getexpph512_mask( \
1084 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
1086#define _mm512_maskz_getexp_round_ph(U, A, R) \
1087 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1088 (__v32hf)_mm512_setzero_ph(), \
1089 (__mmask32)(U), (int)(R)))
1093 return (__m512h)__builtin_ia32_scalefph512_mask(
1094 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (
__mmask32)-1,
1099_mm512_mask_scalef_ph(__m512h __W,
__mmask32 __U, __m512h __A, __m512h __B) {
1100 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1106_mm512_maskz_scalef_ph(
__mmask32 __U, __m512h __A, __m512h __B) {
1107 return (__m512h)__builtin_ia32_scalefph512_mask(
1108 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1112#define _mm512_scalef_round_ph(A, B, R) \
1113 ((__m512h)__builtin_ia32_scalefph512_mask( \
1114 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1115 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1117#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
1118 ((__m512h)__builtin_ia32_scalefph512_mask( \
1119 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
1120 (__mmask32)(U), (int)(R)))
1122#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
1123 ((__m512h)__builtin_ia32_scalefph512_mask( \
1124 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1125 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1127#define _mm512_roundscale_ph(A, B) \
1128 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1129 (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
1130 _MM_FROUND_CUR_DIRECTION))
1132#define _mm512_mask_roundscale_ph(A, B, C, imm) \
1133 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1134 (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
1135 (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
1137#define _mm512_maskz_roundscale_ph(A, B, imm) \
1138 ((__m512h)__builtin_ia32_rndscaleph_mask( \
1139 (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1140 (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
1142#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
1143 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
1144 (__v32hf)(__m512h)(A), \
1145 (__mmask32)(B), (int)(R)))
1147#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
1148 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
1149 (__v32hf)_mm512_setzero_ph(), \
1150 (__mmask32)(A), (int)(R)))
1152#define _mm512_roundscale_round_ph(A, imm, R) \
1153 ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
1154 (__v32hf)_mm512_undefined_ph(), \
1155 (__mmask32)-1, (int)(R)))
1157#define _mm512_reduce_ph(A, imm) \
1158 ((__m512h)__builtin_ia32_reduceph512_mask( \
1159 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
1160 (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
1162#define _mm512_mask_reduce_ph(W, U, A, imm) \
1163 ((__m512h)__builtin_ia32_reduceph512_mask( \
1164 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
1165 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1167#define _mm512_maskz_reduce_ph(U, A, imm) \
1168 ((__m512h)__builtin_ia32_reduceph512_mask( \
1169 (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
1170 (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
1172#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
1173 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1174 (__v32hf)(__m512h)(W), \
1175 (__mmask32)(U), (int)(R)))
1177#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
1178 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1179 (__v32hf)_mm512_setzero_ph(), \
1180 (__mmask32)(U), (int)(R)))
1182#define _mm512_reduce_round_ph(A, imm, R) \
1183 ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
1184 (__v32hf)_mm512_undefined_ph(), \
1185 (__mmask32)-1, (int)(R)))
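/* Note: _mm512_reduce_ph is the VREDUCEPH "reduce by rounding" operation (each
   element minus its value rounded as directed by imm), not a horizontal
   reduction across lanes. */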
1189 return (__m128h)__builtin_ia32_rcpsh_mask(
1190 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1);
1197 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1204 return (__m128h)__builtin_ia32_rcpsh_mask(
1205 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U);
1210 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1211 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1);
1218 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1223_mm_maskz_rsqrt_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1224 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1225 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U);
1228#define _mm_getmant_round_sh(A, B, C, D, R) \
1229 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1230 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1231 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
1233#define _mm_getmant_sh(A, B, C, D) \
1234 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1235 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1236 (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
1238#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
1239 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1240 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1241 (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1243#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
1244 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1245 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1246 (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
1248#define _mm_maskz_getmant_sh(U, A, B, C, D) \
1249 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1250 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1251 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
1253#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
1254 ((__m128h)__builtin_ia32_getmantsh_round_mask( \
1255 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
1256 (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1258#define _mm_getexp_round_sh(A, B, R) \
1259 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1260 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1261 (__mmask8)-1, (int)(R)))
1265 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1266 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1271_mm_mask_getexp_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1272 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1273 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (
__mmask8)__U,
1277#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
1278 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1279 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1280 (__mmask8)(U), (int)(R)))
1283_mm_maskz_getexp_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1284 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1285 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1289#define _mm_maskz_getexp_round_sh(U, A, B, R) \
1290 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1291 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1292 (__mmask8)(U), (int)(R)))
1294#define _mm_scalef_round_sh(A, B, R) \
1295 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1296 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1297 (__mmask8)-1, (int)(R)))
1301 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1302 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1307_mm_mask_scalef_sh(__m128h __W,
__mmask8 __U, __m128h __A, __m128h __B) {
1308 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1313#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
1314 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1315 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1316 (__mmask8)(U), (int)(R)))
1319_mm_maskz_scalef_sh(
__mmask8 __U, __m128h __A, __m128h __B) {
1320 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1321 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1325#define _mm_maskz_scalef_round_sh(U, A, B, R) \
1326 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1327 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1328 (__mmask8)(U), (int)(R)))
1330#define _mm_roundscale_round_sh(A, B, imm, R) \
1331 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1332 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1333 (__mmask8)-1, (int)(imm), (int)(R)))
1335#define _mm_roundscale_sh(A, B, imm) \
1336 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1337 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1338 (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
1340#define _mm_mask_roundscale_sh(W, U, A, B, I) \
1341 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1342 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1343 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1345#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
1346 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1347 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1348 (__mmask8)(U), (int)(I), (int)(R)))
1350#define _mm_maskz_roundscale_sh(U, A, B, I) \
1351 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1352 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1353 (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
1355#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
1356 ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
1357 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1358 (__mmask8)(U), (int)(I), (int)(R)))
1360#define _mm_reduce_sh(A, B, C) \
1361 ((__m128h)__builtin_ia32_reducesh_mask( \
1362 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1363 (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
1365#define _mm_mask_reduce_sh(W, U, A, B, C) \
1366 ((__m128h)__builtin_ia32_reducesh_mask( \
1367 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1368 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1370#define _mm_maskz_reduce_sh(U, A, B, C) \
1371 ((__m128h)__builtin_ia32_reducesh_mask( \
1372 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1373 (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
1375#define _mm_reduce_round_sh(A, B, C, R) \
1376 ((__m128h)__builtin_ia32_reducesh_mask( \
1377 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1378 (__mmask8)-1, (int)(C), (int)(R)))
1380#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
1381 ((__m128h)__builtin_ia32_reducesh_mask( \
1382 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1383 (__mmask8)(U), (int)(C), (int)(R)))
1385#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
1386 ((__m128h)__builtin_ia32_reducesh_mask( \
1387 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1388 (__mmask8)(U), (int)(C), (int)(R)))
1390#define _mm512_sqrt_round_ph(A, R) \
1391 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
1393#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
1394 ((__m512h)__builtin_ia32_selectph_512( \
1395 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1396 (__v32hf)(__m512h)(W)))
1398#define _mm512_maskz_sqrt_round_ph(U, A, R) \
1399 ((__m512h)__builtin_ia32_selectph_512( \
1400 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1401 (__v32hf)_mm512_setzero_ph()))
1404 return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1409_mm512_mask_sqrt_ph(__m512h __W,
__mmask32 __U, __m512h __A) {
1410 return (__m512h)__builtin_ia32_selectph_512(
1413 (__v32hf)(__m512h)(__W));
1417_mm512_maskz_sqrt_ph(
__mmask32 __U, __m512h __A) {
1418 return (__m512h)__builtin_ia32_selectph_512(
1421 (__v32hf)_mm512_setzero_ph());
1424#define _mm_sqrt_round_sh(A, B, R) \
1425 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1426 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1427 (__mmask8)-1, (int)(R)))
1429#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
1430 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1431 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1432 (__mmask8)(U), (int)(R)))
1434#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
1435 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1436 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1437 (__mmask8)(U), (int)(R)))
1441 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1442 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1450 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1451 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1458 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1459 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1463#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
1464 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1465 (int)(imm), (__mmask32)(U)))
1467#define _mm512_fpclass_ph_mask(A, imm) \
1468 ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
1469 (int)(imm), (__mmask32)-1))
1471#define _mm_fpclass_sh_mask(A, imm) \
1472 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
1475#define _mm_mask_fpclass_sh_mask(U, A, imm) \
1476 ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
1479#define _mm512_cvt_roundpd_ph(A, R) \
1480 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1481 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1483#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1484 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1485 (__mmask8)(U), (int)(R)))
1487#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1488 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1489 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1492 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1493 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
1498_mm512_mask_cvtpd_ph(__m128h __W,
__mmask8 __U, __m512d __A) {
1499 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1504_mm512_maskz_cvtpd_ph(
__mmask8 __U, __m512d __A) {
1505 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1506 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1510#define _mm512_cvt_roundph_pd(A, R) \
1511 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1512 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1514#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1515 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1516 (__mmask8)(U), (int)(R)))
1518#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1519 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1520 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1523 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1529_mm512_mask_cvtph_pd(__m512d __W,
__mmask8 __U, __m128h __A) {
1530 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1535_mm512_maskz_cvtph_pd(
__mmask8 __U, __m128h __A) {
1536 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1541#define _mm_cvt_roundsh_ss(A, B, R) \
1542 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1543 (__v4sf)_mm_undefined_ps(), \
1544 (__mmask8)(-1), (int)(R)))
1546#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1547 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1548 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1550#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1551 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1552 (__v4sf)_mm_setzero_ps(), \
1553 (__mmask8)(U), (int)(R)))
1557 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1566 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1574 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1579#define _mm_cvt_roundss_sh(A, B, R) \
1580 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1581 (__v8hf)_mm_undefined_ph(), \
1582 (__mmask8)(-1), (int)(R)))
1584#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1585 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1586 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1588#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1589 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1590 (__v8hf)_mm_setzero_ph(), \
1591 (__mmask8)(U), (int)(R)))
1595 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1596 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1604 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1605 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (
__mmask8)__U,
1612 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1613 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1617#define _mm_cvt_roundsd_sh(A, B, R) \
1618 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1619 (__v8hf)_mm_undefined_ph(), \
1620 (__mmask8)(-1), (int)(R)))
1622#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1623 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1624 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1626#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1627 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1628 (__v8hf)_mm_setzero_ph(), \
1629 (__mmask8)(U), (int)(R)))
1633 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1634 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (
__mmask8)-1,
1642 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1643 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (
__mmask8)__U,
1648_mm_maskz_cvtsd_sh(
__mmask8 __U, __m128h __A, __m128d __B) {
1649 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1650 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
1654#define _mm_cvt_roundsh_sd(A, B, R) \
1655 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1656 (__v2df)_mm_undefined_pd(), \
1657 (__mmask8)(-1), (int)(R)))
1659#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1660 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1661 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1663#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1664 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1665 (__v2df)_mm_setzero_pd(), \
1666 (__mmask8)(U), (int)(R)))
1670 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1679 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1680 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (
__mmask8)__U,
1685_mm_maskz_cvtsh_sd(
__mmask8 __U, __m128d __A, __m128h __B) {
1686 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1691#define _mm512_cvt_roundph_epi16(A, R) \
1692 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1693 (__v32hi)_mm512_undefined_epi32(), \
1694 (__mmask32)(-1), (int)(R)))
1696#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1697 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1698 (__mmask32)(U), (int)(R)))
1700#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1701 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1702 (__v32hi)_mm512_setzero_epi32(), \
1703 (__mmask32)(U), (int)(R)))
1706_mm512_cvtph_epi16(__m512h __A) {
1707 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1713_mm512_mask_cvtph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1714 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1719_mm512_maskz_cvtph_epi16(
__mmask32 __U, __m512h __A) {
1720 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1725#define _mm512_cvtt_roundph_epi16(A, R) \
1726 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1727 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1730#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1731 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1732 (__mmask32)(U), (int)(R)))
1734#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1735 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1736 (__v32hi)_mm512_setzero_epi32(), \
1737 (__mmask32)(U), (int)(R)))
1740_mm512_cvttph_epi16(__m512h __A) {
1741 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1747_mm512_mask_cvttph_epi16(__m512i __W,
__mmask32 __U, __m512h __A) {
1748 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1753_mm512_maskz_cvttph_epi16(
__mmask32 __U, __m512h __A) {
1754 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1759#define _mm512_cvt_roundepi16_ph(A, R) \
1760 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1761 (__v32hf)_mm512_undefined_ph(), \
1762 (__mmask32)(-1), (int)(R)))
1764#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1765 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1766 (__mmask32)(U), (int)(R)))
1768#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1769 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1770 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1773_mm512_cvtepi16_ph(__m512i __A) {
1774 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1775 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1780_mm512_mask_cvtepi16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1781 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1786_mm512_maskz_cvtepi16_ph(
__mmask32 __U, __m512i __A) {
1787 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1788 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1792#define _mm512_cvt_roundph_epu16(A, R) \
1793 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1794 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1797#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1798 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1799 (__mmask32)(U), (int)(R)))
1801#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1802 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1803 (__v32hu)_mm512_setzero_epi32(), \
1804 (__mmask32)(U), (int)(R)))
1807_mm512_cvtph_epu16(__m512h __A) {
1808 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1814_mm512_mask_cvtph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1815 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1820_mm512_maskz_cvtph_epu16(
__mmask32 __U, __m512h __A) {
1821 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1826#define _mm512_cvtt_roundph_epu16(A, R) \
1827 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1828 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1831#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1832 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1833 (__mmask32)(U), (int)(R)))
1835#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1836 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1837 (__v32hu)_mm512_setzero_epi32(), \
1838 (__mmask32)(U), (int)(R)))
1841_mm512_cvttph_epu16(__m512h __A) {
1842 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1848_mm512_mask_cvttph_epu16(__m512i __W,
__mmask32 __U, __m512h __A) {
1849 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1854_mm512_maskz_cvttph_epu16(
__mmask32 __U, __m512h __A) {
1855 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1860#define _mm512_cvt_roundepu16_ph(A, R) \
1861 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1862 (__v32hf)_mm512_undefined_ph(), \
1863 (__mmask32)(-1), (int)(R)))
1865#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1866 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1867 (__mmask32)(U), (int)(R)))
1869#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1870 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1871 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1874_mm512_cvtepu16_ph(__m512i __A) {
1875 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1876 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)-1,
1881_mm512_mask_cvtepu16_ph(__m512h __W,
__mmask32 __U, __m512i __A) {
1882 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1887_mm512_maskz_cvtepu16_ph(
__mmask32 __U, __m512i __A) {
1888 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1889 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (
__mmask32)__U,
1893#define _mm512_cvt_roundph_epi32(A, R) \
1894 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1895 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1898#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1899 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1900 (__mmask16)(U), (int)(R)))
1902#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1903 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1904 (__v16si)_mm512_setzero_epi32(), \
1905 (__mmask16)(U), (int)(R)))
1908_mm512_cvtph_epi32(__m256h __A) {
1909 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1915_mm512_mask_cvtph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
1916 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1921_mm512_maskz_cvtph_epi32(
__mmask16 __U, __m256h __A) {
1922 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1927#define _mm512_cvt_roundph_epu32(A, R) \
1928 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1929 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1932#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1933 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1934 (__mmask16)(U), (int)(R)))
1936#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1937 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1938 (__v16su)_mm512_setzero_epi32(), \
1939 (__mmask16)(U), (int)(R)))
1942_mm512_cvtph_epu32(__m256h __A) {
1943 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1949_mm512_mask_cvtph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
1950 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1955_mm512_maskz_cvtph_epu32(
__mmask16 __U, __m256h __A) {
1956 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1961#define _mm512_cvt_roundepi32_ph(A, R) \
1962 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1963 (__v16hf)_mm256_undefined_ph(), \
1964 (__mmask16)(-1), (int)(R)))
1966#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1967 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1968 (__mmask16)(U), (int)(R)))
1970#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1971 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1972 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1975_mm512_cvtepi32_ph(__m512i __A) {
1976 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1977 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
1982_mm512_mask_cvtepi32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
1983 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1988_mm512_maskz_cvtepi32_ph(
__mmask16 __U, __m512i __A) {
1989 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1990 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
1994#define _mm512_cvt_roundepu32_ph(A, R) \
1995 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1996 (__v16hf)_mm256_undefined_ph(), \
1997 (__mmask16)(-1), (int)(R)))
1999#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
2000 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
2001 (__mmask16)(U), (int)(R)))
2003#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
2004 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
2005 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2008_mm512_cvtepu32_ph(__m512i __A) {
2009 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2010 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)-1,
2015_mm512_mask_cvtepu32_ph(__m256h __W,
__mmask16 __U, __m512i __A) {
2016 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2021_mm512_maskz_cvtepu32_ph(
__mmask16 __U, __m512i __A) {
2022 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2023 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (
__mmask16)__U,
2027#define _mm512_cvtt_roundph_epi32(A, R) \
2028 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2029 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2032#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2033 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2034 (__mmask16)(U), (int)(R)))
2036#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2037 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2038 (__v16si)_mm512_setzero_epi32(), \
2039 (__mmask16)(U), (int)(R)))
2042_mm512_cvttph_epi32(__m256h __A) {
2043 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2049_mm512_mask_cvttph_epi32(__m512i __W,
__mmask16 __U, __m256h __A) {
2050 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2055_mm512_maskz_cvttph_epi32(
__mmask16 __U, __m256h __A) {
2056 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2061#define _mm512_cvtt_roundph_epu32(A, R) \
2062 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2063 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2066#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2067 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2068 (__mmask16)(U), (int)(R)))
2070#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2071 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2072 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2076_mm512_cvttph_epu32(__m256h __A) {
2077 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2083_mm512_mask_cvttph_epu32(__m512i __W,
__mmask16 __U, __m256h __A) {
2084 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2089_mm512_maskz_cvttph_epu32(
__mmask16 __U, __m256h __A) {
2090 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2095#define _mm512_cvt_roundepi64_ph(A, R) \
2096 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2097 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2099#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2100 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2101 (__mmask8)(U), (int)(R)))
2103#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2104 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2105 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2108_mm512_cvtepi64_ph(__m512i __A) {
2109 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2110 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2115_mm512_mask_cvtepi64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2116 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2121_mm512_maskz_cvtepi64_ph(
__mmask8 __U, __m512i __A) {
2122 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2123 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2127#define _mm512_cvt_roundph_epi64(A, R) \
2128 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2129 (__v8di)_mm512_undefined_epi32(), \
2130 (__mmask8)(-1), (int)(R)))
2132#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2133 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2134 (__mmask8)(U), (int)(R)))
2136#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2137 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2138 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2141_mm512_cvtph_epi64(__m128h __A) {
2142 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2148_mm512_mask_cvtph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2149 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2154_mm512_maskz_cvtph_epi64(
__mmask8 __U, __m128h __A) {
2155 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2160#define _mm512_cvt_roundepu64_ph(A, R) \
2161 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2162 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2164#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2165 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2166 (__mmask8)(U), (int)(R)))
2168#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2169 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2170 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2173_mm512_cvtepu64_ph(__m512i __A) {
2174 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2175 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)-1,
2180_mm512_mask_cvtepu64_ph(__m128h __W,
__mmask8 __U, __m512i __A) {
2181 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2186_mm512_maskz_cvtepu64_ph(
__mmask8 __U, __m512i __A) {
2187 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2188 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (
__mmask8)__U,
2192#define _mm512_cvt_roundph_epu64(A, R) \
2193 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2194 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2197#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2198 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2199 (__mmask8)(U), (int)(R)))
2201#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2202 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2203 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2206_mm512_cvtph_epu64(__m128h __A) {
2207 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2213_mm512_mask_cvtph_epu64(__m512i __W,
__mmask8 __U, __m128h __A) {
2214 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2219_mm512_maskz_cvtph_epu64(
__mmask8 __U, __m128h __A) {
2220 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2225#define _mm512_cvtt_roundph_epi64(A, R) \
2226 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2227 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2230#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2231 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2232 (__mmask8)(U), (int)(R)))
2234#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2235 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2236 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2239_mm512_cvttph_epi64(__m128h __A) {
2240 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2246_mm512_mask_cvttph_epi64(__m512i __W,
__mmask8 __U, __m128h __A) {
2247 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2252_mm512_maskz_cvttph_epi64(
__mmask8 __U, __m128h __A) {
2253 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
#define _mm512_cvtt_roundph_epu64(A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm_cvt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

#define _mm_cvt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
                                               _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#define _mm_cvt_roundu32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
  __A[0] = __B;
  return __A;
}

#define _mm_cvt_roundu64_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
                                        (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
  __A[0] = __B;
  return __A;
}

#define _mm_cvt_roundi32_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))

#define _mm_cvt_roundi64_sh(A, B, R) \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
#define _mm_cvtt_roundsh_i32(A, R) \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvtt_roundsh_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
                                                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvtt_roundsh_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvtt_roundsh_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
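/* Illustrative usage sketch (not part of the header): scalar conversions of
 * element 0. The _cvt_roundsh_ forms take an explicit rounding/SAE selector,
 * while the _cvttsh_ forms always truncate toward zero. Variable names are
 * hypothetical; _mm_set_sh is assumed from elsewhere in this header.
 *
 *   __m128h x = _mm_set_sh((_Float16)2.75);
 *   unsigned int rounded = _mm_cvt_roundsh_u32(
 *       x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // 3
 *   unsigned int truncated = _mm_cvttsh_u32(x);            // 2
 */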
#define _mm512_cvtx_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
                                             (__v16sf)_mm512_undefined_ps(), \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtx_roundps_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
                                              (__v16hf)_mm256_undefined_ph(), \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
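/* Illustrative round trip (not part of the header): widen 16 fp16 lanes to
 * fp32, operate in single precision, then narrow back using the current
 * rounding mode. Variable names are hypothetical.
 *
 *   __m256h h = _mm256_set1_ph((_Float16)0.5);
 *   __m512 f = _mm512_add_ps(_mm512_cvtxph_ps(h), _mm512_set1_ps(1.0f));
 *   __m256h narrowed = _mm512_cvtxps_ph(f); // 16 copies of 1.5
 */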
#define _mm512_fmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, -(__v32hf)__B, (__v32hf)__C, (__mmask32)-1, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3(
      -(__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      -(__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask(
      (__v32hf)__A, -(__v32hf)__B, -(__v32hf)__C, (__mmask32)-1, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}
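/* Illustrative usage sketch (not part of the header): a zero-masked fused
 * multiply-add over 32 fp16 lanes, dst = A*B + C, with lanes whose mask bit
 * is clear forced to zero. Variable names are hypothetical.
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.5);
 *   __m512h b = _mm512_set1_ph((_Float16)2.0);
 *   __m512h c = _mm512_set1_ph((_Float16)0.25);
 *   __m512h r = _mm512_maskz_fmadd_ph((__mmask32)0xFFFF0000u, a, b, c);
 */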
#define _mm512_fmaddsub_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
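/* Semantics note with a hypothetical sketch (not normative): fmaddsub
 * alternates per lane, computing A*B - C in even lanes and A*B + C in odd
 * lanes; fmsubadd swaps the two.
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.0);
 *   __m512h b = _mm512_set1_ph((_Float16)2.0);
 *   __m512h c = _mm512_set1_ph((_Float16)0.5);
 *   __m512h r = _mm512_fmaddsub_ph(a, b, c); // 1.5, 2.5, 1.5, 2.5, ...
 */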
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddph512_mask( \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmadd_sh(__m128h __W, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmsub_sh(__m128h __W, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
                                                 -(__v8hf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), (int)R))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fnmadd_sh(__m128h __W, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmadd_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fnmsub_sh(__m128h __W, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B,
                                        -(__v8hf)__C, (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
      (__mmask8)(U), (int)(R)))
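/* Illustrative usage sketch (not part of the header): scalar fp16 FMA. Only
 * element 0 is computed as W*A + B; elements 1..7 of the result are taken
 * from W. Variable names are hypothetical.
 *
 *   __m128h w = _mm_set1_ph((_Float16)2.0);
 *   __m128h a = _mm_set1_ph((_Float16)3.0);
 *   __m128h b = _mm_set1_ph((_Float16)1.0);
 *   __m128h r = _mm_fmadd_sh(w, a, b); // element 0 == 7.0
 */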
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)-1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__C, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fcmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmadd_sch(__m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__C, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sch(A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
      (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fcmul_sch(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__W, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fcmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_fmul_sch(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                               (__v4sf)__W, (__mmask8)__U,
                                               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmul_round_sch(A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R) \
  ((__m128h)__builtin_ia32_vfmulcsh_mask( \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
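/* Semantics note with a hypothetical sketch (not normative): the _sch/_pch
 * forms treat each (even, odd) lane pair as one complex fp16 value.
 * _mm_fmul_sch multiplies the low complex numbers of A and B; _mm_fcmul_sch
 * multiplies by the conjugate of B instead. Variable names are hypothetical.
 *
 *   __m128h a = _mm_setzero_ph(), b = _mm_setzero_ph();
 *   a[0] = (_Float16)1.0; a[1] = (_Float16)2.0; // a = 1 + 2i
 *   b[0] = (_Float16)3.0; b[1] = (_Float16)4.0; // b = 3 + 4i
 *   __m128h p = _mm_fmul_sch(a, b);  // -5 + 10i in elements 0..1
 *   __m128h q = _mm_fcmul_sch(a, b); // 11 + 2i  in elements 0..1
 */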
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fcmul_pch(__m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(),
      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(),
      (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fcmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmul_pch(__m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(),
      (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(),
      (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmul_round_pch(A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
  ((__m512h)__builtin_ia32_vfmulcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fcmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmadd_round_pch(A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
      (__mmask16)(U), (int)(R)))
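/* Illustrative usage sketch (not part of the header): packed complex FMA over
 * 16 complex fp16 values (32 half lanes), result = A*B + C per pair; the
 * fcmadd form conjugates B first. Variable names are hypothetical.
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.0); // every complex = 1 + 1i
 *   __m512h b = _mm512_set1_ph((_Float16)2.0); // every complex = 2 + 2i
 *   __m512h r = _mm512_fmadd_pch(a, b, _mm512_setzero_ph()); // 0 + 4i each
 */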
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
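/* Illustrative usage sketch (not part of the header): horizontal reductions
 * over all 32 fp16 lanes; the identity operands (-0.0 for add, 1.0 for
 * multiply) are visible in the definitions above. Names are hypothetical.
 *
 *   __m512h v = _mm512_set1_ph((_Float16)1.0);
 *   _Float16 sum = _mm512_reduce_add_ph(v); // 32.0
 *   _Float16 mx = _mm512_reduce_max_ph(v);  // 1.0
 */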
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R) \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R) \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R) \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
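/* Note (illustrative): the mul/cmul spellings above are plain aliases of the
 * fmul/fcmul intrinsics, so the two calls below are equivalent. Variable
 * names are hypothetical.
 *
 *   __m512h x = _mm512_set1_ph((_Float16)1.0);
 *   __m512h y = _mm512_set1_ph((_Float16)2.0);
 *   __m512h r1 = _mm512_mul_pch(x, y);
 *   __m512h r2 = _mm512_fmul_pch(x, y); // same result as r1
 */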
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#undef __DEFAULT_FN_ATTRS512_CONSTEXPR

#endif