#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."

#ifndef __AVX512FINTRIN_H
#define __AVX512FINTRIN_H
typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));
typedef signed char __v64qs __attribute__((__vector_size__(64)));

typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));
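/* Note (added for this listing, not in the upstream header): the __m512,
 * __m512d and __m512i types require 64-byte alignment, while the *_u
 * variants are 1-byte aligned and back the unaligned load/store
 * intrinsics, e.g.
 *   __m512d v = _mm512_loadu_pd(p);   // p need not be 64-byte aligned
 */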
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_CMPINT_GE _MM_CMPINT_NLT
#define _MM_CMPINT_GT _MM_CMPINT_NLE
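/* Usage sketch (added): the _MM_FROUND_* immediates select a static
 * rounding mode for the *_round_* intrinsics below. Any mode other than
 * _MM_FROUND_CUR_DIRECTION must be combined with _MM_FROUND_NO_EXC (0x08,
 * defined in the SSE4.1 header) to suppress exceptions, e.g.
 *   __m512d r = _mm512_add_round_pd(a, b,
 *                   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 */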
#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))

  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };

#define _mm512_setzero_epi32 _mm512_setzero_si512

  return (__m512d)__builtin_ia32_undef512();
  return (__m512)__builtin_ia32_undef512();
  return (__m512)__builtin_ia32_undef512();
  return (__m512i)__builtin_ia32_undef512();
  return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  return (__m512i)__builtin_ia32_selectd_512(__M,
  return (__m512i)__builtin_ia32_selectd_512(__M,
  return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
  return (__m512i)__builtin_ia32_selectq_512(__M,
  return (__m512i)__builtin_ia32_selectq_512(__M,
  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

#define _mm512_setzero _mm512_setzero_ps

  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
  return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                                 __w, __w, __w, __w, __w, __w, __w, __w };
  return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
  return __extension__ (__m512i)(__v64qi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
  return __extension__ (__m512i)(__v32hi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
  return __extension__ (__m512i)(__v16si){
    __s, __s, __s, __s, __s, __s, __s, __s,
    __s, __s, __s, __s, __s, __s, __s, __s };
  return (__m512i)__builtin_ia32_selectd_512(__M,
  return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
  return (__m512i)__builtin_ia32_selectq_512(__M,
  return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  return __extension__ (__m512i)(__v16si)
   { __D, __C, __B, __A, __D, __C, __B, __A,
     __D, __C, __B, __A, __D, __C, __B, __A };
  return __extension__ (__m512i) (__v8di)
   { __D, __C, __B, __A, __D, __C, __B, __A };
  return __extension__ (__m512d)
   { __D, __C, __B, __A, __D, __C, __B, __A };
  return __extension__ (__m512)
   { __D, __C, __B, __A, __D, __C, __B, __A,
     __D, __C, __B, __A, __D, __C, __B, __A };

#define _mm512_setr4_epi32(e0,e1,e2,e3) \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

#define _mm512_setr4_epi64(e0,e1,e2,e3) \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

#define _mm512_setr4_pd(e0,e1,e2,e3) \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

#define _mm512_setr4_ps(e0,e1,e2,e3) \
  _mm512_set4_ps((e3),(e2),(e1),(e0))
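/* Note (added): the setr4 forms are the memory-order variants; they forward
 * to set4 with the arguments reversed, so _mm512_setr4_epi32(0,1,2,3)
 * repeats {0,1,2,3} in ascending element order, while
 * _mm512_set4_epi32(0,1,2,3) repeats {3,2,1,0}.
 */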
  return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7);
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return __builtin_shufflevector(__a, __a, 0, 1);
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
  return (__m512) (__A);
  return (__m512i) (__A);
  __m256d __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
      __B, 0, 1, 2, 3, 4, 5, 6, 7);
  return (__m512d) (__A);
  return (__m512i) (__A);
  __m256 __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
      __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  __m256i __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
      __B, 0, 1, 2, 3, 4, 5, 6, 7);
  return __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7);
  return (__m512) (__A);
  return (__m512d) (__A);
  return (__m128i)__builtin_shufflevector(__A, __A, 0, 1);
  return (__m256i)__builtin_shufflevector(__A, __A, 0, 1, 2, 3);
  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(),
                                 0, 1, 2, 3, 2, 3, 2, 3);
  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(),
                                 0, 1, 2, 3, 4, 5, 6, 7);
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(),
                                 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(),
                                 0, 1, 2, 3, 2, 3, 2, 3);
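/* Note (added): in the cast* widenings above the upper elements of the
 * result are undefined (modeled with __builtin_nondeterministic_value),
 * whereas the zext* forms shuffle in lanes from a zero vector, so e.g.
 * _mm512_zextpd128_pd512(x) is {x[0], x[1], 0, 0, 0, 0, 0, 0}.
 */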
  return (__m512i)((__v16su)__a & (__v16su)__b);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
  return (__m512i)((__v8du)__a & (__v8du)__b);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
  return (__m512i)(~(__v16su)__A & (__v16su)__B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
  return (__m512i)((__v16su)__a | (__v16su)__b);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
  return (__m512i)((__v8du)__a | (__v8du)__b);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
  return (__m512i)((__v16su)__a ^ (__v16su)__b);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
  return (__m512i)((__v8du)__a & (__v8du)__b);
  return (__m512i)((__v8du)__a | (__v8du)__b);
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
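/* Note (added): the masked forms throughout this header follow one pattern:
 * compute the full-width result, then use __builtin_ia32_select*_512 to
 * pick, per element, between that result and the passthrough source
 * (mask_) or zero (maskz_), under the corresponding __mmask8/__mmask16 bit.
 */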
  return (__m512d)((__v8df)__a + (__v8df)__b);
  return (__m512)((__v16sf)__a + (__v16sf)__b);
  return (__m512d)((__v8df)__a * (__v8df)__b);
  return (__m512)((__v16sf)__a * (__v16sf)__b);
  return (__m512d)((__v8df)__a - (__v8df)__b);
  return (__m512)((__v16sf)__a - (__v16sf)__b);
  return (__m512i) ((__v8du) __A + (__v8du) __B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
  return (__m512i) ((__v8du) __A - (__v8du) __B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
  return (__m512i) ((__v16su) __A + (__v16su) __B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
  return (__m512i) ((__v16su) __A - (__v16su) __B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
#define _mm512_max_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_max_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_max_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))
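/* Usage sketch (added): the maskz_ form zeroes deselected lanes, e.g.
 *   __m512d r = _mm512_maskz_max_round_pd(0x0F, a, b,
 *                                         _MM_FROUND_CUR_DIRECTION);
 * writes max(a,b) into lanes 0..3 and 0.0 into lanes 4..7.
 */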
  return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
  return (__m512d)__builtin_ia32_selectpd_512(__U,
  return (__m512d)__builtin_ia32_selectpd_512(__U,

#define _mm512_max_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_max_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_max_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

  return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
  return (__m512)__builtin_ia32_selectps_512(__U,
  return (__m512)__builtin_ia32_selectps_512(__U,
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,

#define _mm_max_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_max_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,

#define _mm_max_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
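/* Note (added): the _ss/_sd forms operate on element 0 only; the remaining
 * elements of the result are copied from A, and the mask governs just
 * element 0, e.g.
 *   __m128 r = _mm_maskz_max_round_ss(k, a, b, _MM_FROUND_CUR_DIRECTION);
 * yields {k&1 ? max(a0,b0) : 0.0f, a1, a2, a3}.
 */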
static __inline __m512i
  return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
#define _mm512_min_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_min_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_min_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

  return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
  return (__m512d)__builtin_ia32_selectpd_512(__U,
  return (__m512d)__builtin_ia32_selectpd_512(__U,

#define _mm512_min_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_min_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_min_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

  return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
  return (__m512)__builtin_ia32_selectps_512(__U,
  return (__m512)__builtin_ia32_selectps_512(__U,
  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,

#define _mm_min_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_min_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,

#define _mm_min_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
static __inline __m512i
  return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si)__Y);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
  return (__m512i) ((__v16su) __A * (__v16su) __B);
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
  return (__m512i) ((__v8du) __A * (__v8du) __B);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
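/* Note (added): mul_epi32/mul_epu32 (pmuldq/pmuludq) multiply only the low
 * 32 bits of each 64-bit lane and produce full 64-bit products, while
 * mullo_epi32 keeps the low 32 bits of a 32x32-bit multiply.
 */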
#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))

#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

  return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
  return (__m512d)__builtin_ia32_selectpd_512(__U,
  return (__m512d)__builtin_ia32_selectpd_512(__U,

#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

  return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
  return (__m512)__builtin_ia32_selectps_512(__U,
  return (__m512)__builtin_ia32_selectps_512(__U,
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
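/* Note (added): the rsqrt14/rcp14 families return approximations of
 * 1/sqrt(x) and 1/x with a maximum relative error of 2^-14; they trade
 * accuracy for latency and are typically refined with a Newton-Raphson
 * step when full precision is needed.
 */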
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  (__v16sf) __A, (unsigned short)-1,
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   (__v8df) __A, (unsigned char)-1,
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  (__v16sf) __A, (unsigned short)-1,
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   (__v8df) __A, (unsigned char)-1,
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
  return (__m512i)__builtin_elementwise_abs((__v8di)__A);
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
  return (__m512i)__builtin_elementwise_abs((__v16si) __A);
  return (__m512i)__builtin_ia32_selectd_512(__U,
  return (__m512i)__builtin_ia32_selectd_512(__U,
  return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_add_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_add_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

  return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_add_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_add_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_add_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_add_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_add_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_add_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_add_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
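/* Note (added): the *_round_* entry points are macros rather than inline
 * functions because the rounding argument must be an integer constant
 * expression; it is encoded into the instruction's EVEX prefix at compile
 * time and cannot be a runtime value.
 */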
  return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_sub_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_sub_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

  return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_sub_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_sub_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_sub_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_sub_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_sub_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
  return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_mul_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_mul_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

  return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_mul_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_mul_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_mul_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_mul_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_mul_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
  return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_div_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_div_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

  return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_div_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

  return (__m512d)((__v8df)__a/(__v8df)__b);
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
  return (__m512)((__v16sf)__a/(__v16sf)__b);
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_div_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_div_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_div_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_div_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_div_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_div_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
#define _mm512_roundscale_ps(A, B) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ps(A, B, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
                                          (int)(R)))

#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), (int)(R)))

#define _mm512_roundscale_round_ps(A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, (int)(R)))

#define _mm512_roundscale_pd(A, B) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_pd(A, B, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
                                           (int)(R)))

#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), (int)(R)))

#define _mm512_roundscale_round_pd(A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, (int)(R)))
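/* Usage sketch (added, semantics per the VRNDSCALE instruction): bits [7:4]
 * of imm give the number of fraction bits M to keep and the low bits give
 * the rounding mode, computing 2^-M * round(2^M * x); with M == 0,
 *   _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF)
 * is a per-lane floor.
 */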
#define _mm512_fmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fnmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fnmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))
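/* Note (added): all four fused forms above reuse the single vfmadd builtin
 * by negating operands:
 *   fmadd  =  A*B + C     fmsub  =  A*B - C   (C negated)
 *   fnmadd = -(A*B) + C   fnmsub = -(A*B) - C (one factor, and C, negated)
 */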
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
#define _mm512_fmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fnmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fnmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
#define _mm512_fmaddsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

#define _mm512_fmsubadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                -(__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,

#define _mm512_fmaddsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_fmsubadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               -(__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

  return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,

#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

  return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,

#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

  return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,

#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

  return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,

#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,

#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
  return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
  return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
  return (__m512i)__builtin_ia32_selectd_512(__U,
  return (__m512i)__builtin_ia32_selectd_512(__U,
  return (__m512i)__builtin_ia32_selectd_512(__U,
  return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
  return (__m512i)__builtin_ia32_selectq_512(__U,
  return (__m512i)__builtin_ia32_selectq_512(__U,
  return (__m512i)__builtin_ia32_selectq_512(__U,
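/* Note (added): vpermi2var (permutex2var) builds each destination element
 * from the two-table concatenation of __A and __B: the low index bits pick
 * an element and the next bit (bit 4 for epi32, bit 3 for epi64) selects
 * which source vector it comes from.
 */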
#define _mm512_alignr_epi64(A, B, I) \
  ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
                                     (__v8di)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                       (__v8di)_mm512_setzero_si512()))

#define _mm512_alignr_epi32(A, B, I) \
  ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
                                     (__v16si)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                       (__v16si)_mm512_setzero_si512()))
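/* Usage sketch (added): _mm512_alignr_epi32(A, B, I) concatenates A (high)
 * and B (low) into a 1024-bit value, shifts right by I 32-bit elements,
 * and keeps the low 512 bits; e.g. I == 1 gives { B[1], ..., B[15], A[0] }.
 */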
#define _mm512_extractf64x4_pd(A, I) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
                                             (__v4df)_mm256_undefined_pd(), \
                                             (__mmask8)-1))

#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)(__m256d)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)_mm256_setzero_pd(), \
                                             (__mmask8)(U)))

#define _mm512_extractf32x4_ps(A, I) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
                                            (__v4sf)_mm_undefined_ps(), \
                                            (__mmask8)-1))

#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)(__m128)(W), \
                                            (__mmask8)(U)))

#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)(U)))
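/* Usage sketch (added): the immediate selects which 128-bit (or 256-bit)
 * lane to extract, e.g.
 *   __m128 hi = _mm512_extractf32x4_ps(v, 3);   // elements 12..15 of v
 */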
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
#define _mm512_cmp_round_ps_mask(A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_cmp_ps_mask(A, B, P) \
  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_cmpeq_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_ps_mask(A, B) \
  _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_ps_mask(k, A, B) \
  _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)

#define _mm512_cmp_round_pd_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)-1, (int)(R)))

#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)(U), (int)(R)))

#define _mm512_cmp_pd_mask(A, B, P) \
  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_cmpeq_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_pd_mask(A, B) \
  _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_pd_mask(k, A, B) \
  _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
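/* Usage sketch (added): the comparisons produce a bitmask, not a vector;
 * a typical pattern feeds the mask into a masked operation, e.g.
 *   __mmask8 k = _mm512_cmplt_pd_mask(x, y);
 *   __m512d r  = _mm512_mask_add_pd(x, k, x, y);  // x+y where x < y, else x
 */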
#define _mm512_cvtt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_undefined_epi32(), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)(__m512i)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_setzero_si512(), \
                                              (__mmask16)(U), (int)(R)))

  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,

#define _mm512_cvt_roundepi32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_cvt_roundepu32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ps (__m512i __A)
{
  return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
      (__v16sf)_mm512_cvtepu32_ps(__A), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
      (__v16sf)_mm512_cvtepu32_ps(__A), (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_pd (__m256i __A)
{
  return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_cvtepi32_pd(__A), (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_cvtepi32_pd(__A), (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ps (__m512i __A)
{
  return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
      (__v16sf)_mm512_cvtepi32_ps(__A), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
      (__v16sf)_mm512_cvtepi32_ps(__A), (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_pd (__m256i __A)
{
  return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_cvtepu32_pd(__A), (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_cvtepu32_pd(__A), (__v8df)_mm512_setzero_pd());
}
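
/* Usage sketch (not part of the original header): widening u32 -> double is
   exact, since double carries a 52-bit mantissa. Illustrative only; compile
   with -mavx512f.

     #include <immintrin.h>

     static inline __m512d u32_to_f64(__m256i v) {
       return _mm512_cvtepu32_pd(v);   // 8 x uint32 -> 8 x double, exact
     }
*/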
#define _mm512_cvt_roundpd_ps(A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))
static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_cvtpd_ps (__m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                (__v8sf) _mm256_undefined_ps (),
                (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                (__v8sf) __W,
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                (__v8sf) _mm256_setzero_ps (),
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtpd_pslo (__m512d __A)
{
  return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
                (__v8sf) _mm256_setzero_ps (),
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U, __m512d __A)
{
  return (__m512) __builtin_shufflevector (
                (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
                                               __U, __A),
                (__v8sf) _mm256_setzero_ps (),
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
#define _mm512_cvt_roundps_ph(A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_undefined_si256(), \
                                             (__mmask16)-1))

#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)(__m256i)(U), \
                                             (__mmask16)(W)))

#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
                                             (__mmask16)(W)))

#define _mm512_cvtps_ph       _mm512_cvt_roundps_ph
#define _mm512_mask_cvtps_ph  _mm512_mask_cvt_roundps_ph
#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
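
/* Usage sketch (not part of the original header): a round-trip through half
   precision, which quantizes fp32 values to fp16's ~11-bit mantissa.
   Illustrative only; compile with -mavx512f.

     #include <immintrin.h>

     static inline __m512 quantize_to_fp16(__m512 v) {
       __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT);
       return _mm512_cvtph_ps(h);   // back to fp32, precision reduced
     }
*/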
#define _mm512_cvt_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtph_ps (__m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                (__v16sf) _mm512_setzero_ps (),
                (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                (__v16sf) __W,
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                (__v16sf) _mm512_setzero_ps (),
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvtt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)(__m256i)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)(U), (int)(R)))
static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epi32 (__m512d __a)
{
  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
                (__v8si)_mm256_setzero_si256(),
                (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                (__v8si) __W,
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                (__v8si) _mm256_setzero_si256 (),
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}
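
/* Usage sketch (not part of the original header): the narrowing pd -> epi32
   truncation returns a 256-bit vector, since 8 doubles yield 8 x int32.
   Illustrative only; compile with -mavx512f.

     #include <immintrin.h>

     static inline __m256i f64_to_i32_trunc(__m512d v) {
       return _mm512_cvttpd_epi32(v);   // e.g. -2.7 -> -2
     }
*/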
#define _mm512_cvtt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)(U), (int)(R)))
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epi32 (__m512 __a)
{
  return (__m512i)
    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
                                     (__v16si) _mm512_setzero_si512 (),
                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                (__v16si) __W,
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                (__v16si) _mm512_setzero_si512 (),
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)(__m512i)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epi32 (__m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                (__v16si) _mm512_undefined_epi32 (),
                (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                (__v16si) __W,
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                (__v16si) _mm512_setzero_si512 (),
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)(__m256i)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)(U), (int)(R)))
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epi32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                (__v8si) _mm256_undefined_si256 (),
                (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                (__v8si) __W,
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                (__v8si) _mm256_setzero_si256 (),
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}
#define _mm512_cvt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epu32 (__m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                (__v16si) _mm512_undefined_epi32 (),
                (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                (__v16si) __W,
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                (__v16si) _mm512_setzero_si512 (),
                (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
}
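
/* Usage sketch (not part of the original header): the _mm512_cvt_round* forms
   take an explicit rounding direction; when the direction is not
   _MM_FROUND_CUR_DIRECTION, clang expects it OR'd with _MM_FROUND_NO_EXC.
   Illustrative only; compile with -mavx512f.

     #include <immintrin.h>

     static inline __m512i f32_to_u32_floor(__m512 v) {
       return _mm512_cvt_roundps_epu32(v, _MM_FROUND_TO_NEG_INF |
                                          _MM_FROUND_NO_EXC);
     }
*/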
#define _mm512_cvt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)(__m256i)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)(U), (int)(R)))
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epu32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                (__v8si) _mm256_undefined_si256 (),
                (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                (__v8si) __W,
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                (__v8si) _mm256_setzero_si256 (),
                (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
}
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_unpackhi_pd(__m512d __a, __m512d __b)
{
  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_unpackhi_pd(__A, __B), (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_unpackhi_pd(__A, __B), (__v8df)_mm512_setzero_pd());
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_unpacklo_pd(__m512d __a, __m512d __b)
{
  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_unpacklo_pd(__A, __B), (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
      (__v8df)_mm512_unpacklo_pd(__A, __B), (__v8df)_mm512_setzero_pd());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_unpackhi_ps(__m512 __a, __m512 __b)
{
  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                         2,    18,    3,    19,
                                         2+4,  18+4,  3+4,  19+4,
                                         2+8,  18+8,  3+8,  19+8,
                                         2+12, 18+12, 3+12, 19+12);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
      (__v16sf)_mm512_unpackhi_ps(__A, __B), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
      (__v16sf)_mm512_unpackhi_ps(__A, __B), (__v16sf)_mm512_setzero_ps());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_unpacklo_ps(__m512 __a, __m512 __b)
{
  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                         0,    16,    1,    17,
                                         0+4,  16+4,  1+4,  17+4,
                                         0+8,  16+8,  1+8,  17+8,
                                         0+12, 16+12, 1+12, 17+12);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
      (__v16sf)_mm512_unpacklo_ps(__A, __B), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
      (__v16sf)_mm512_unpacklo_ps(__A, __B), (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                          2,    18,    3,    19,
                                          2+4,  18+4,  3+4,  19+4,
                                          2+8,  18+8,  3+8,  19+8,
                                          2+12, 18+12, 3+12, 19+12);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
      (__v16si)_mm512_unpackhi_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
      (__v16si)_mm512_unpackhi_epi32(__A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                          0,    16,    1,    17,
                                          0+4,  16+4,  1+4,  17+4,
                                          0+8,  16+8,  1+8,  17+8,
                                          0+12, 16+12, 1+12, 17+12);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
      (__v16si)_mm512_unpacklo_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
      (__v16si)_mm512_unpacklo_epi32(__A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
      (__v8di)_mm512_unpackhi_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
      (__v8di)_mm512_unpackhi_epi64(__A, __B),
      (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
      (__v8di)_mm512_unpacklo_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
      (__v8di)_mm512_unpacklo_epi64(__A, __B),
      (__v8di)_mm512_setzero_si512());
}
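
/* Usage sketch (not part of the original header): unpacklo/unpackhi
   interleave within each 128-bit lane, not across the whole register.
   For a = {0..15} and b = {16..31}, unpacklo_epi32 yields
   {0,16,1,17, 4,20,5,21, 8,24,9,25, 12,28,13,29}. Illustrative only.

     #include <immintrin.h>

     static inline void interleave32(__m512i a, __m512i b,
                                     __m512i *lo, __m512i *hi) {
       *lo = _mm512_unpacklo_epi32(a, b);
       *hi = _mm512_unpackhi_epi32(a, b);
     }
*/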
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_si512 (void const *__P)
{
  struct __loadu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_si512*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi32 (void const *__P)
{
  struct __loadu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi32*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
                (__v16si) __W,
                (__mmask16) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
                (__v16si) _mm512_setzero_si512 (),
                (__mmask16) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi64 (void const *__P)
{
  struct __loadu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi64*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
                (__v8di) __W,
                (__mmask8) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
                (__v8di) _mm512_setzero_si512 (),
                (__mmask8) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
                (__v16sf) __W,
                (__mmask16) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
                (__v16sf) _mm512_setzero_ps (),
                (__mmask16) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
                (__v8df) __W,
                (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
                (__v8df) _mm512_setzero_pd (),
                (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_loadu_pd(void const *__p)
{
  struct __loadu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pd*)__p)->__v;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_loadu_ps(void const *__p)
{
  struct __loadu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_load_ps(void const *__p)
{
  return *(const __m512*)__p;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
                (__v16sf) __W,
                (__mmask16) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
                (__v16sf) _mm512_setzero_ps (),
                (__mmask16) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_load_pd(void const *__p)
{
  return *(const __m512d*)__p;
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
                (__v8df) __W,
                (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
                (__v8df) _mm512_setzero_pd (),
                (__mmask8) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_si512 (void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi32 (void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi64 (void const *__P)
{
  return *(const __m512i *) __P;
}
static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi64 (void *__P, __m512i __A)
{
  struct __storeu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi64*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
                (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_si512 (void *__P, __m512i __A)
{
  struct __storeu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si512*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi32 (void *__P, __m512i __A)
{
  struct __storeu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi32*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
                (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_pd(void *__P, __m512d __A)
{
  struct __storeu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pd*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
                (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_ps(void *__P, __m512 __A)
{
  struct __storeu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_pd(void *__P, __m512d __A)
{
  *(__m512d*)__P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
                (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_ps(void *__P, __m512 __A)
{
  *(__m512*)__P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_si512 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi32 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi64 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}
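
/* Usage sketch (not part of the original header): masked loads/stores make
   tail handling branch-free; lanes beyond n are neither read nor written.
   Illustrative only; compile with -mavx512f.

     #include <immintrin.h>

     static inline void scale_tail(float *dst, const float *src, unsigned n) {
       __mmask16 k = (__mmask16)((1u << n) - 1);    // n < 16 enabled lanes
       __m512 v = _mm512_maskz_loadu_ps(k, src);
       _mm512_mask_storeu_ps(dst, k, _mm512_mul_ps(v, _mm512_set1_ps(2.0f)));
     }
*/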
static __inline __mmask16 __DEFAULT_FN_ATTRS
_mm512_knot (__mmask16 __M)
{
  return __builtin_ia32_knothi(__M);
}
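
/* Usage sketch (not part of the original header): _mm512_knot complements a
   16-bit mask, e.g. to split a vector into "selected" and "rest" lanes.
   Illustrative only.

     #include <immintrin.h>

     static inline __m512i blend_split(__mmask16 k, __m512i a, __m512i b) {
       __m512i lo = _mm512_maskz_mov_epi32(k, a);
       __m512i hi = _mm512_maskz_mov_epi32(_mm512_knot(k), b);
       return _mm512_or_si512(lo, hi);
     }
*/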
#define _mm512_cmpeq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
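
/* Usage sketch (not part of the original header): integer compares produce
   masks that feed directly into masked arithmetic. Illustrative only.

     #include <immintrin.h>

     // Add step only to elements of v that are still below limit.
     static inline __m512i add_below(__m512i v, __m512i limit, __m512i step) {
       __mmask16 k = _mm512_cmplt_epi32_mask(v, limit);
       return _mm512_mask_add_epi32(v, k, v, step);
     }
*/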
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi32(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepi8_epi32(__A), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepi8_epi32(__A), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi64(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepi8_epi64(__A), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepi8_epi64(__A), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepi32_epi64(__X), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepi32_epi64(__X), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepi16_epi32(__A), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepi16_epi32(__A), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepi16_epi64(__A), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepi16_epi64(__A), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi32(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepu8_epi32(__A), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepu8_epi32(__A), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepu8_epi64(__A), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepu8_epi64(__A), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepu32_epi64(__X), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepu32_epi64(__X), (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepu16_epi32(__A), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
      (__v16si)_mm512_cvtepu16_epi32(__A), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepu16_epi64(__A), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
      (__v8di)_mm512_cvtepu16_epi64(__A), (__v8di)_mm512_setzero_si512());
}
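
/* Usage sketch (not part of the original header): a common widening pattern,
   e.g. for 8-bit pixel data. Illustrative only; compile with -mavx512f.

     #include <immintrin.h>

     static inline __m512 bytes_to_float(__m128i px) {
       return _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(px));  // 16 x u8 -> f32
     }
*/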
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rorv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
      (__v16si)_mm512_rorv_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
      (__v16si)_mm512_rorv_epi32(__A, __B), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rorv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
      (__v8di)_mm512_rorv_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
      (__v8di)_mm512_rorv_epi64(__A, __B), (__v8di)_mm512_setzero_si512());
}
#define _mm512_cmp_epi32_mask(a, b, p) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)-1))

#define _mm512_cmp_epu32_mask(a, b, p) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)-1))

#define _mm512_cmp_epi64_mask(a, b, p) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)-1))

#define _mm512_cmp_epu64_mask(a, b, p) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)-1))

#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)(m)))

#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)(m)))

#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)(m)))

#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)(m)))
#define _mm512_rol_epi32(a, b) \
  ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))

#define _mm512_mask_rol_epi32(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_rol_epi32((a), (b)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_rol_epi32(U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_rol_epi32((a), (b)), \
                                       (__v16si)_mm512_setzero_si512()))

#define _mm512_rol_epi64(a, b) \
  ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))

#define _mm512_mask_rol_epi64(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_rol_epi64((a), (b)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_rol_epi64(U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_rol_epi64((a), (b)), \
                                       (__v8di)_mm512_setzero_si512()))
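
/* Usage sketch (not part of the original header): rotate-by-immediate needs a
   compile-time constant count; the rolv/rorv forms take a vector of per-lane
   counts instead. Illustrative only.

     #include <immintrin.h>

     static inline __m512i rotl7(__m512i v) {
       return _mm512_rol_epi32(v, 7);   // each 32-bit lane rotated left by 7
     }
*/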
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
      (__v16si)_mm512_rolv_epi32(__A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
      (__v16si)_mm512_rolv_epi32(__A, __B), (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
      (__v8di)_mm512_rolv_epi64(__A, __B), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
      (__v8di)_mm512_rolv_epi64(__A, __B), (__v8di)_mm512_setzero_si512());
}
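
/* Usage sketch (not part of the original header): per-lane variable rotate,
   e.g. for hash mixing where each lane could use a different count.
   Illustrative only.

     #include <immintrin.h>

     static inline __m512i mix(__m512i v) {
       const __m512i counts = _mm512_set1_epi32(13);  // could differ per lane
       return _mm512_rolv_epi32(v, counts);
     }
*/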