#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."

#ifndef __AVX512FINTRIN_H
#define __AVX512FINTRIN_H
typedef long long __v8di __attribute__((__vector_size__(64)));

typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));

typedef signed char __v64qs __attribute__((__vector_size__(64)));

typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF     0x01
#define _MM_FROUND_TO_POS_INF     0x02
#define _MM_FROUND_TO_ZERO        0x03
#define _MM_FROUND_CUR_DIRECTION  0x04
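/* Usage sketch (illustrative, assuming AVX-512F is enabled and <immintrin.h>
 * is included): the *_round_* intrinsics below take either
 * _MM_FROUND_CUR_DIRECTION or one of the four static modes OR'ed with
 * _MM_FROUND_NO_EXC (defined in <smmintrin.h>), e.g.
 *
 *   __m512 sum = _mm512_add_round_ps(a, b,
 *                    _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 */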
#define _MM_CMPINT_GE _MM_CMPINT_NLT
#define _MM_CMPINT_GT _MM_CMPINT_NLE
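/* Sketch (assumes the integer _mm512_cmp_*_mask intrinsics defined later in
 * this header): the _MM_CMPINT_* predicates select the comparison, e.g.
 *
 *   __mmask16 k = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_GE);
 */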
#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))

return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };

#define _mm512_setzero_epi32 _mm512_setzero_si512

return (__m512d)__builtin_ia32_undef512();

return (__m512)__builtin_ia32_undef512();

return (__m512)__builtin_ia32_undef512();

return (__m512i)__builtin_ia32_undef512();

return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
return (__m512i)__builtin_ia32_selectd_512(__M,

return (__m512i)__builtin_ia32_selectd_512(__M,

return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                        0, 0, 0, 0, 0, 0, 0, 0);

return (__m512i)__builtin_ia32_selectq_512(__M,

return (__m512i)__builtin_ia32_selectq_512(__M,

return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                               0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

#define _mm512_setzero _mm512_setzero_ps

return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };

return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                               __w, __w, __w, __w, __w, __w, __w, __w };

return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
return __extension__ (__m512i)(__v64qi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };

return __extension__ (__m512i)(__v32hi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };

return __extension__ (__m512i)(__v16si){
    __s, __s, __s, __s, __s, __s, __s, __s,
    __s, __s, __s, __s, __s, __s, __s, __s };

return (__m512i)__builtin_ia32_selectd_512(__M,

return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };

return (__m512i)__builtin_ia32_selectq_512(__M,

return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
return __extension__ (__m512i)(__v16si)
  { __D, __C, __B, __A, __D, __C, __B, __A,
    __D, __C, __B, __A, __D, __C, __B, __A };

return __extension__ (__m512i) (__v8di)
  { __D, __C, __B, __A, __D, __C, __B, __A };

return __extension__ (__m512d)
  { __D, __C, __B, __A, __D, __C, __B, __A };

return __extension__ (__m512)
  { __D, __C, __B, __A, __D, __C, __B, __A,
    __D, __C, __B, __A, __D, __C, __B, __A };

#define _mm512_setr4_epi32(e0,e1,e2,e3) \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

#define _mm512_setr4_epi64(e0,e1,e2,e3) \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

#define _mm512_setr4_pd(e0,e1,e2,e3) \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

#define _mm512_setr4_ps(e0,e1,e2,e3) \
  _mm512_set4_ps((e3),(e2),(e1),(e0))
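/* Element-order sketch: _mm512_set4_epi32(a, b, c, d) puts its last argument
 * in the lowest lane and repeats the four values across the vector, while the
 * _mm512_setr4_* forms take their arguments lowest-lane first:
 *
 *   _mm512_set4_epi32(3, 2, 1, 0);   // lanes 0..15 = 0,1,2,3,0,1,2,3,...
 *   _mm512_setr4_epi32(0, 1, 2, 3);  // the same vector
 */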
return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                        0, 0, 0, 0, 0, 0, 0, 0);

return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);

return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
                               -1, -1, -1, -1, -1, -1, -1, -1);

return __builtin_shufflevector(__a, __a, 0, 1);

return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);

return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);

return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);

return (__m512) (__A);

return (__m512i) (__A);

return __builtin_shufflevector(__A, __A, 0, 1, -1, -1, -1, -1, -1, -1);

return (__m512d) (__A);

return (__m512i) (__A);

return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1,
                               -1, -1, -1, -1, -1, -1);

return __builtin_shufflevector(__A, __A, 0, 1, -1, -1, -1, -1, -1, -1);

return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, -1, -1, -1, -1);

return (__m512) (__A);

return (__m512d) (__A);

return (__m128i)__builtin_shufflevector(__A, __A, 0, 1);

return (__m256i)__builtin_shufflevector(__A, __A, 0, 1, 2, 3);
return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(),
                               0, 1, 2, 3, 2, 3, 2, 3);

return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(),
                               0, 1, 2, 3, 4, 5, 6, 7);

return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(),
                               0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);

return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(),
                               0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(),
                               0, 1, 2, 3, 2, 3, 2, 3);
return (__m512i)((__v16su)__a & (__v16su)__b);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,

return (__m512i)((__v8du)__a & (__v8du)__b);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,

return (__m512i)(~(__v8du)__A & (__v8du)__B);

return (__m512i)(~(__v16su)__A & (__v16su)__B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,

return (__m512i)(~(__v8du)__A & (__v8du)__B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,

return (__m512i)((__v16su)__a | (__v16su)__b);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,

return (__m512i)((__v8du)__a | (__v8du)__b);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,

return (__m512i)((__v16su)__a ^ (__v16su)__b);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,

return (__m512i)((__v8du)__a ^ (__v8du)__b);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,

return (__m512i)((__v8du)__a & (__v8du)__b);

return (__m512i)((__v8du)__a | (__v8du)__b);

return (__m512i)((__v8du)__a ^ (__v8du)__b);
return (__m512d)((__v8df)__a + (__v8df)__b);

return (__m512)((__v16sf)__a + (__v16sf)__b);

return (__m512d)((__v8df)__a * (__v8df)__b);

return (__m512)((__v16sf)__a * (__v16sf)__b);

return (__m512d)((__v8df)__a - (__v8df)__b);

return (__m512)((__v16sf)__a - (__v16sf)__b);

return (__m512i) ((__v8du) __A + (__v8du) __B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,

return (__m512i) ((__v8du) __A - (__v8du) __B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,

return (__m512i) ((__v16su) __A + (__v16su) __B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,

return (__m512i) ((__v16su) __A - (__v16su) __B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
#define _mm512_max_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_max_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_max_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,

return (__m512d)__builtin_ia32_selectpd_512(__U,

return (__m512d)__builtin_ia32_selectpd_512(__U,

#define _mm512_max_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_max_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_max_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,

return (__m512)__builtin_ia32_selectps_512(__U,

return (__m512)__builtin_ia32_selectps_512(__U,

return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,

return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,

#define _mm_max_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_max_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,

return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,

#define _mm_max_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
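/* Masking sketch (the convention used throughout this header): the mask_*
 * forms merge the result into W under mask U; the maskz_* forms zero the
 * inactive lanes:
 *
 *   __m512d r1 = _mm512_mask_max_pd(w, u, a, b);  // lane i: u ? max : w[i]
 *   __m512d r2 = _mm512_maskz_max_pd(u, a, b);    // lane i: u ? max : 0.0
 */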
static __inline __m512i

return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
#define _mm512_min_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_min_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_min_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,

return (__m512d)__builtin_ia32_selectpd_512(__U,

return (__m512d)__builtin_ia32_selectpd_512(__U,

#define _mm512_min_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_min_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_min_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,

return (__m512)__builtin_ia32_selectps_512(__U,

return (__m512)__builtin_ia32_selectps_512(__U,

return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,

return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,

#define _mm_min_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_min_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,

return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,

#define _mm_min_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
static __inline __m512i

return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si)__Y);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,

return (__m512i) ((__v16su) __A * (__v16su) __B);

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,

return (__m512i) ((__v8du) __A * (__v8du) __B);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))

#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,

return (__m512d)__builtin_ia32_selectpd_512(__U,

return (__m512d)__builtin_ia32_selectpd_512(__U,

#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
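/* Sketch: square root with an explicit rounding mode (assumes
 * _MM_FROUND_NO_EXC from <smmintrin.h>):
 *
 *   __m512 r = _mm512_sqrt_round_ps(x, _MM_FROUND_TO_NEAREST_INT |
 *                                      _MM_FROUND_NO_EXC);
 */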
return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,

return (__m512)__builtin_ia32_selectps_512(__U,

return (__m512)__builtin_ia32_selectps_512(__U,

return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,

return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,

return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,

return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,

return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,

return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,

return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,

return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,

return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,

return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,

return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,

return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,

return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,

return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,

return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,

return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,

return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                (__v16sf) __A, (unsigned short)-1,

return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,

return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                 (__v8df) __A, (unsigned char)-1,

return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,

return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                (__v16sf) __A, (unsigned short)-1,

return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                 (__v8df) __A, (unsigned char)-1,

return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,

return (__m512i)__builtin_elementwise_abs((__v8di)__A);

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,

return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,

return (__m512i)__builtin_elementwise_abs((__v16si) __A);

return (__m512i)__builtin_ia32_selectd_512(__U,

return (__m512i)__builtin_ia32_selectd_512(__U,
return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_add_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_add_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_add_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_add_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_add_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_add_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_add_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_add_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_add_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_sub_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_sub_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_sub_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_sub_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_sub_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_sub_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_sub_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_mul_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_mul_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_mul_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_mul_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_mul_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_mul_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_mul_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
return __builtin_ia32_selectss_128(__U, __A, __W);

#define _mm_div_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_div_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

return __builtin_ia32_selectsd_128(__U, __A, __W);

#define _mm_div_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

return (__m512d)((__v8df)__a/(__v8df)__b);

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,

return (__m512)((__v16sf)__a/(__v16sf)__b);

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,

#define _mm512_div_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_div_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_div_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_div_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_div_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_div_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))
#define _mm512_roundscale_ps(A, B) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ps(A, B, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
                                          (int)(R)))

#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), (int)(R)))

#define _mm512_roundscale_round_ps(A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, (int)(R)))

#define _mm512_roundscale_pd(A, B) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_pd(A, B, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
                                           (int)(R)))

#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), (int)(R)))

#define _mm512_roundscale_round_pd(A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, (int)(R)))
#define _mm512_fmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fnmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fnmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))
return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
#define _mm512_fmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fnmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fnmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))
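/* Sign conventions of the four FMA families, all built on the same fused
 * multiply-add builtin with operands negated as needed:
 *
 *   fmadd:  (A * B) + C        fmsub:  (A * B) - C
 *   fnmadd: -(A * B) + C       fnmsub: -(A * B) - C
 */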
return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
#define _mm512_fmaddsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

#define _mm512_fmsubadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                -(__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,

#define _mm512_fmaddsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_fmsubadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               -(__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,

#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,

#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,

#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,

#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,

return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,

#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,

return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,

return (__m512i)__builtin_ia32_selectd_512(__U,

return (__m512i)__builtin_ia32_selectd_512(__U,

return (__m512i)__builtin_ia32_selectd_512(__U,

return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,

return (__m512i)__builtin_ia32_selectq_512(__U,

return (__m512i)__builtin_ia32_selectq_512(__U,

return (__m512i)__builtin_ia32_selectq_512(__U,
#define _mm512_alignr_epi64(A, B, I) \
  ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
                                     (__v8di)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                       (__v8di)_mm512_setzero_si512()))

#define _mm512_alignr_epi32(A, B, I) \
  ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
                                     (__v16si)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                       (__v16si)_mm512_setzero_si512()))
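/* Sketch: alignr treats A:B as one double-width vector (A in the upper half)
 * and shifts right by I elements, e.g.
 *
 *   __m512i r = _mm512_alignr_epi64(a, b, 1);
 *   // r = { b[1], b[2], ..., b[7], a[0] }
 */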
#define _mm512_extractf64x4_pd(A, I) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
                                             (__v4df)_mm256_undefined_pd(), \
                                             (__mmask8)-1))

#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)(__m256d)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)_mm256_setzero_pd(), \
                                             (__mmask8)(U)))

#define _mm512_extractf32x4_ps(A, I) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
                                            (__v4sf)_mm_undefined_ps(), \
                                            (__mmask8)-1))

#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)(__m128)(W), \
                                            (__mmask8)(U)))

#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)(U)))
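/* Sketch: the extract macros pull one 256-bit (f64x4) or 128-bit (f32x4)
 * lane out of a 512-bit vector; the immediate selects the lane:
 *
 *   __m256d lo = _mm512_extractf64x4_pd(v, 0);  // elements 0..3
 *   __m256d hi = _mm512_extractf64x4_pd(v, 1);  // elements 4..7
 */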
return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,

return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,

return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,

return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
#define _mm512_cmp_round_ps_mask(A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_cmp_ps_mask(A, B, P) \
  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_cmpeq_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)

#define _mm512_cmp_round_pd_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)-1, (int)(R)))

#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)(U), (int)(R)))

#define _mm512_cmp_pd_mask(A, B, P) \
  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_cmpeq_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
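/* Sketch: a comparison produces a bitmask that can drive any masked
 * operation, e.g.
 *
 *   __mmask8 k = _mm512_cmplt_pd_mask(a, b);    // bit i = (a[i] < b[i])
 *   __m512d  r = _mm512_maskz_add_pd(k, a, b);  // a+b where a<b, else 0.0
 */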
#define _mm512_cvtt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_undefined_epi32(), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)(__m512i)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_setzero_si512(), \
                                              (__mmask16)(U), (int)(R)))

return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,

return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,

return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
#define _mm512_cvt_roundepi32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_cvt_roundepu32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))
3602 return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
3608 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3616 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
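/* Usage sketch (illustrative): the round forms take a static rounding mode
 * OR'd with _MM_FROUND_NO_EXC instead of consulting MXCSR; `v` is an
 * assumed __m512i holding unsigned 32-bit lanes.
 *
 *   __m512 f0 = _mm512_cvtepu32_ps(v);               // rounds per MXCSR
 *   __m512 f1 = _mm512_cvt_roundepu32_ps(v,
 *                   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 */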
3624 return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
3630 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3638 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3658 return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
3664 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3672 return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3680 return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
3686 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3694 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3711 #define _mm512_cvt_roundpd_ps(A, R) \
3712 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
3713 (__v8sf)_mm256_setzero_ps(), \
3714 (__mmask8)-1, (int)(R)))
3716 #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
3717 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
3718 (__v8sf)(__m256)(W), (__mmask8)(U), \
3719 (int)(R)))
3721 #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
3722 ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
3723 (__v8sf)_mm256_setzero_ps(), \
3724 (__mmask8)(U), (int)(R)))
3729 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3738 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3747 return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
3756 return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
3757 (__v8sf) _mm256_setzero_ps (),
3758 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3764 return (__m512) __builtin_shufflevector (
3768 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3771 #define _mm512_cvt_roundps_ph(A, I) \
3772 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
3773 (__v16hi)_mm256_undefined_si256(), \
3774 (__mmask16)-1))
3776 #define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
3777 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
3778 (__v16hi)(__m256i)(U), \
3779 (__mmask16)(W)))
3781 #define _mm512_maskz_cvt_roundps_ph(W, A, I) \
3782 ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
3783 (__v16hi)_mm256_setzero_si256(), \
3784 (__mmask16)(W)))
3786 #define _mm512_cvtps_ph _mm512_cvt_roundps_ph
3787 #define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph
3788 #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
3790 #define _mm512_cvt_roundph_ps(A, R) \
3791 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
3792 (__v16sf)_mm512_undefined_ps(), \
3793 (__mmask16)-1, (int)(R)))
3795 #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
3796 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
3797 (__v16sf)(__m512)(W), \
3798 (__mmask16)(U), (int)(R)))
3800 #define _mm512_maskz_cvt_roundph_ps(U, A, R) \
3801 ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
3802 (__v16sf)_mm512_setzero_ps(), \
3803 (__mmask16)(U), (int)(R)))
3809 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3819 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
3828 return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
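/* Usage sketch (illustrative): a half-precision round trip; the packed
 * float16 bits travel in a __m256i. `v` is an assumed __m512.
 *
 *   __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT); // ps -> ph
 *   __m512  w = _mm512_cvtph_ps(h);                            // ph -> ps
 */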
3834 #define _mm512_cvtt_roundpd_epi32(A, R) \
3835 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
3836 (__v8si)_mm256_setzero_si256(), \
3837 (__mmask8)-1, (int)(R)))
3839 #define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
3840 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
3841 (__v8si)(__m256i)(W), \
3842 (__mmask8)(U), (int)(R)))
3844 #define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
3845 ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
3846 (__v8si)_mm256_setzero_si256(), \
3847 (__mmask8)(U), (int)(R)))
3852 return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
3861 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
3870 return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
3876 #define _mm512_cvtt_roundps_epi32(A, R) \
3877 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
3878 (__v16si)_mm512_setzero_si512(), \
3879 (__mmask16)-1, (int)(R)))
3881 #define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
3882 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
3883 (__v16si)(__m512i)(W), \
3884 (__mmask16)(U), (int)(R)))
3886 #define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
3887 ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
3888 (__v16si)_mm512_setzero_si512(), \
3889 (__mmask16)(U), (int)(R)))
3895 __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
3903 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
3912 return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
3918 #define _mm512_cvt_roundps_epi32(A, R) \
3919 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
3920 (__v16si)_mm512_setzero_si512(), \
3921 (__mmask16)-1, (int)(R)))
3923 #define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
3924 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
3925 (__v16si)(__m512i)(W), \
3926 (__mmask16)(U), (int)(R)))
3928 #define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
3929 ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
3930 (__v16si)_mm512_setzero_si512(), \
3931 (__mmask16)(U), (int)(R)))
3936 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3945 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3954 return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
3961 #define _mm512_cvt_roundpd_epi32(A, R) \
3962 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
3963 (__v8si)_mm256_setzero_si256(), \
3964 (__mmask8)-1, (int)(R)))
3966 #define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
3967 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
3968 (__v8si)(__m256i)(W), \
3969 (__mmask8)(U), (int)(R)))
3971 #define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
3972 ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
3973 (__v8si)_mm256_setzero_si256(), \
3974 (__mmask8)(U), (int)(R)))
3979 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
3989 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
3998 return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
4005 #define _mm512_cvt_roundps_epu32(A, R) \
4006 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
4007 (__v16si)_mm512_setzero_si512(), \
4008 (__mmask16)-1, (int)(R)))
4010 #define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
4011 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
4012 (__v16si)(__m512i)(W), \
4013 (__mmask16)(U), (int)(R)))
4015 #define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
4016 ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
4017 (__v16si)_mm512_setzero_si512(), \
4018 (__mmask16)(U), (int)(R)))
4023 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4033 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4042 return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4049 #define _mm512_cvt_roundpd_epu32(A, R) \
4050 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
4051 (__v8si)_mm256_setzero_si256(), \
4052 (__mmask8)-1, (int)(R)))
4054 #define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
4055 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
4056 (__v8si)(__m256i)(W), \
4057 (__mmask8)(U), (int)(R)))
4059 #define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
4060 ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
4061 (__v8si)_mm256_setzero_si256(), \
4062 (__mmask8)(U), (int)(R)))
4067 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4077 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4086 return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
4110 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
4111 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
4117 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4125 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4133 return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
4134 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
4140 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4148 return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4156 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
4157 2, 18, 3, 19,
4158 2+4, 18+4, 3+4, 19+4,
4159 2+8, 18+8, 3+8, 19+8,
4160 2+12, 18+12, 3+12, 19+12);
4166 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4174 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4182 return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
4183 0, 16, 1, 17,
4184 0+4, 16+4, 1+4, 17+4,
4185 0+8, 16+8, 1+8, 17+8,
4186 0+12, 16+12, 1+12, 17+12);
4192 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4200 return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4208 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
4209 2, 18, 3, 19,
4210 2+4, 18+4, 3+4, 19+4,
4211 2+8, 18+8, 3+8, 19+8,
4212 2+12, 18+12, 3+12, 19+12);
4218 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4226 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4234 return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
4235 0, 16, 1, 17,
4236 0+4, 16+4, 1+4, 17+4,
4237 0+8, 16+8, 1+8, 17+8,
4238 0+12, 16+12, 1+12, 17+12);
4244 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4252 return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4260 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
4261 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
4267 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4275 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4283 return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
4284 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
4290 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
4298 return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
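/* Usage sketch (illustrative): unpacklo/unpackhi interleave the two sources
 * within each 128-bit lane, as the shuffle indices above spell out. With
 * assumed epi32 values x = {0,1,...,15} and y = {16,17,...,31}:
 *
 *   _mm512_unpacklo_epi32(x, y); // {0,16,1,17, 4,20,5,21, 8,24,9,25, 12,28,13,29}
 *   _mm512_unpackhi_epi32(x, y); // {2,18,3,19, 6,22,7,23, 10,26,11,27, 14,30,15,31}
 */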
4309 struct __loadu_si512 {
4312 return ((const struct __loadu_si512*)__P)->__v;
4318 struct __loadu_epi32 {
4321 return ((const struct __loadu_epi32*)__P)->__v;
4327 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
4336 return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
4345 struct __loadu_epi64 {
4348 return ((const struct __loadu_epi64*)__P)->__v;
4354 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
4362 return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
4371 return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
4379 return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
4388 return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
4396 return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
4408 return ((const struct __loadu_pd*)__p)->__v;
4417 return ((const struct __loadu_ps*)__p)->__v;
4423 return *(const __m512*)__p;
4429 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
4437 return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
4446 return *(const __m512d*)__p;
4452 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
4460 return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
4469 return *(const __m512i *) __P;
4475 return *(const __m512i *) __P;
4481 return *(const __m512i *) __P;
4489 struct __storeu_epi64 {
4492 ((struct __storeu_epi64*)__P)->__v = __A;
4498 __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
4505 struct __storeu_si512 {
4508 ((struct __storeu_si512*)__P)->__v = __A;
4514 struct __storeu_epi32 {
4517 ((struct __storeu_epi32*)__P)->__v = __A;
4523 __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
4530 __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
4536 struct __storeu_pd {
4539 ((struct __storeu_pd*)__P)->__v = __A;
4545 __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
4552 struct __storeu_ps {
4555 ((struct __storeu_ps*)__P)->__v = __A;
4561 __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
4567 *(__m512d*)__P = __A;
4573 __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
4580 *(__m512*)__P = __A;
4586 *(__m512i *) __P = __A;
4592 *(__m512i *) __P = __A;
4598 *(__m512i *) __P = __A;
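/* Usage sketch (illustrative): unaligned load plus masked unaligned store;
 * lanes whose mask bit is clear are left untouched in memory. `src` and
 * `dst` are assumed int arrays of at least 16 elements.
 *
 *   __m512i v = _mm512_loadu_epi32(src);                  // no alignment needed
 *   _mm512_mask_storeu_epi32(dst, (__mmask16)0x00FF, v);  // low 8 lanes only
 */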
4606 return __builtin_ia32_knothi(__M);
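/* Usage sketch (illustrative): _mm512_knot inverts all 16 mask bits, so a
 * compare followed by knot selects the complementary lane set; `k` is an
 * assumed __mmask16.
 *
 *   __mmask16 not_k = _mm512_knot(k); // ~k
 */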
4611 #define _mm512_cmpeq_epi32_mask(A, B) \
4612 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
4613 #define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
4614 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
4615 #define _mm512_cmpge_epi32_mask(A, B) \
4616 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
4617 #define _mm512_mask_cmpge_epi32_mask(k, A, B) \
4618 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
4619 #define _mm512_cmpgt_epi32_mask(A, B) \
4620 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
4621 #define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
4622 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
4623 #define _mm512_cmple_epi32_mask(A, B) \
4624 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
4625 #define _mm512_mask_cmple_epi32_mask(k, A, B) \
4626 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
4627 #define _mm512_cmplt_epi32_mask(A, B) \
4628 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
4629 #define _mm512_mask_cmplt_epi32_mask(k, A, B) \
4630 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
4631 #define _mm512_cmpneq_epi32_mask(A, B) \
4632 _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
4633 #define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
4634 _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
4636 #define _mm512_cmpeq_epu32_mask(A, B) \
4637 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
4638 #define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
4639 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
4640 #define _mm512_cmpge_epu32_mask(A, B) \
4641 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
4642 #define _mm512_mask_cmpge_epu32_mask(k, A, B) \
4643 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
4644 #define _mm512_cmpgt_epu32_mask(A, B) \
4645 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
4646 #define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
4647 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
4648 #define _mm512_cmple_epu32_mask(A, B) \
4649 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
4650 #define _mm512_mask_cmple_epu32_mask(k, A, B) \
4651 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
4652 #define _mm512_cmplt_epu32_mask(A, B) \
4653 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
4654 #define _mm512_mask_cmplt_epu32_mask(k, A, B) \
4655 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
4656 #define _mm512_cmpneq_epu32_mask(A, B) \
4657 _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
4658 #define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
4659 _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
4661 #define _mm512_cmpeq_epi64_mask(A, B) \
4662 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
4663 #define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
4664 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
4665 #define _mm512_cmpge_epi64_mask(A, B) \
4666 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
4667 #define _mm512_mask_cmpge_epi64_mask(k, A, B) \
4668 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
4669 #define _mm512_cmpgt_epi64_mask(A, B) \
4670 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
4671 #define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
4672 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
4673 #define _mm512_cmple_epi64_mask(A, B) \
4674 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
4675 #define _mm512_mask_cmple_epi64_mask(k, A, B) \
4676 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
4677 #define _mm512_cmplt_epi64_mask(A, B) \
4678 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
4679 #define _mm512_mask_cmplt_epi64_mask(k, A, B) \
4680 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
4681 #define _mm512_cmpneq_epi64_mask(A, B) \
4682 _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
4683 #define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
4684 _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
4686 #define _mm512_cmpeq_epu64_mask(A, B) \
4687 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
4688 #define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
4689 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
4690 #define _mm512_cmpge_epu64_mask(A, B) \
4691 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
4692 #define _mm512_mask_cmpge_epu64_mask(k, A, B) \
4693 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
4694 #define _mm512_cmpgt_epu64_mask(A, B) \
4695 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
4696 #define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
4697 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
4698 #define _mm512_cmple_epu64_mask(A, B) \
4699 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
4700 #define _mm512_mask_cmple_epu64_mask(k, A, B) \
4701 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
4702 #define _mm512_cmplt_epu64_mask(A, B) \
4703 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
4704 #define _mm512_mask_cmplt_epu64_mask(k, A, B) \
4705 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
4706 #define _mm512_cmpneq_epu64_mask(A, B) \
4707 _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
4708 #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
4709 _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
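/* Usage sketch (illustrative): the epi* aliases compare lanes as signed,
 * the epu* aliases as unsigned, so the same bit patterns can yield
 * different masks; `a` and `b` are assumed __m512i values.
 *
 *   __mmask16 s = _mm512_cmplt_epi32_mask(a, b); // signed   a[i] < b[i]
 *   __mmask16 u = _mm512_cmplt_epu32_mask(a, b); // unsigned a[i] < b[i]
 */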
4716 return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
4722 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4730 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4740 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
4746 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4754 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4762 return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
4768 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4776 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4784 return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
4790 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4798 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4806 return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
4812 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4820 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4828 return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
4834 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4842 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4850 return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
4856 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4864 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4872 return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
4878 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4886 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4894 return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
4900 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4908 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
4916 return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
4922 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
4930 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
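/* Usage sketch (illustrative): the widening conversions above differ only
 * in extension kind: cvtepi8_epi32 sign-extends, cvtepu8_epi32
 * zero-extends. `b` is an assumed __m128i of sixteen bytes.
 *
 *   __m512i s = _mm512_cvtepi8_epi32(b); // byte 0x80 -> -128
 *   __m512i u = _mm512_cvtepu8_epi32(b); // byte 0x80 ->  128
 */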
4938 return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
4944 return (__m512i)__builtin_ia32_selectd_512(__U,
4952 return (__m512i)__builtin_ia32_selectd_512(__U,
4960 return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
4966 return (__m512i)__builtin_ia32_selectq_512(__U,
4974 return (__m512i)__builtin_ia32_selectq_512(__U,
4981 #define _mm512_cmp_epi32_mask(a, b, p) \
4982 ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
4983 (__v16si)(__m512i)(b), (int)(p), \
4984 (__mmask16)-1))
4986 #define _mm512_cmp_epu32_mask(a, b, p) \
4987 ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
4988 (__v16si)(__m512i)(b), (int)(p), \
4989 (__mmask16)-1))
4991 #define _mm512_cmp_epi64_mask(a, b, p) \
4992 ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
4993 (__v8di)(__m512i)(b), (int)(p), \
4994 (__mmask8)-1))
4996 #define _mm512_cmp_epu64_mask(a, b, p) \
4997 ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
4998 (__v8di)(__m512i)(b), (int)(p), \
4999 (__mmask8)-1))
5001 #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
5002 ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
5003 (__v16si)(__m512i)(b), (int)(p), \
5004 (__mmask16)(m)))
5006 #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
5007 ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
5008 (__v16si)(__m512i)(b), (int)(p), \
5009 (__mmask16)(m)))
5011 #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
5012 ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
5013 (__v8di)(__m512i)(b), (int)(p), \
5014 (__mmask8)(m)))
5016 #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
5017 ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
5018 (__v8di)(__m512i)(b), (int)(p), \
5019 (__mmask8)(m)))
5021 #define _mm512_rol_epi32(a, b) \
5022 ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))
5024 #define _mm512_mask_rol_epi32(W, U, a, b) \
5025 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
5026 (__v16si)_mm512_rol_epi32((a), (b)), \
5027 (__v16si)(__m512i)(W)))
5029 #define _mm512_maskz_rol_epi32(U, a, b) \
5030 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
5031 (__v16si)_mm512_rol_epi32((a), (b)), \
5032 (__v16si)_mm512_setzero_si512()))
5034 #define _mm512_rol_epi64(a, b) \
5035 ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))
5037 #define _mm512_mask_rol_epi64(W, U, a, b) \
5038 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
5039 (__v8di)_mm512_rol_epi64((a), (b)), \
5040 (__v8di)(__m512i)(W)))
5042 #define _mm512_maskz_rol_epi64(U, a, b) \
5043 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
5044 (__v8di)_mm512_rol_epi64((a), (b)), \
5045 (__v8di)_mm512_setzero_si512()))
5050 return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
5056 return (__m512i)__builtin_ia32_selectd_512(__U,
5064 return (__m512i)__builtin_ia32_selectd_512(__U,
5072 return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
5078 return (__m512i)__builtin_ia32_selectq_512(__U,
5086 return (__m512i)__builtin_ia32_selectq_512(__U,
5091 #define _mm512_ror_epi32(A, B) \
5092 ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))
5094 #define _mm512_mask_ror_epi32(W, U, A, B) \
5095 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
5096 (__v16si)_mm512_ror_epi32((A), (B)), \
5097 (__v16si)(__m512i)(W)))
5099 #define _mm512_maskz_ror_epi32(U, A, B) \
5100 ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
5101 (__v16si)_mm512_ror_epi32((A), (B)), \
5102 (__v16si)_mm512_setzero_si512()))
5104 #define _mm512_ror_epi64(A, B) \
5105 ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))
5107 #define _mm512_mask_ror_epi64(W, U, A, B) \
5108 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
5109 (__v8di)_mm512_ror_epi64((A), (B)), \
5110 (__v8di)(__m512i)(W)))
5112 #define _mm512_maskz_ror_epi64(U, A, B) \
5113 ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
5114 (__v8di)_mm512_ror_epi64((A), (B)), \
5115 (__v8di)_mm512_setzero_si512()))
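/* Usage sketch (illustrative): rol/ror rotate every lane by the same
 * immediate, while rolv/rorv take a per-lane count vector; `v` is an
 * assumed __m512i.
 *
 *   __m512i r1 = _mm512_rol_epi32(v, 8);                     // fixed count
 *   __m512i r2 = _mm512_rorv_epi32(v, _mm512_set1_epi32(8)); // vector count
 */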
5120 return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
5127 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5134 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5142 return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
5148 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5156 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5164 return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
5171 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5178 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5186 return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
5193 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
5202 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
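/* Usage sketch (illustrative): slli/srli shift each lane by one immediate
 * count, shifting in zeros; `v` is an assumed __m512i.
 *
 *   __m512i lo = _mm512_srli_epi32(v, 16); // high halfword, zero-extended
 *   __m512i hi = _mm512_slli_epi32(v, 16); // low halfword moved up
 */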
5210 return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
5218 return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
5227 __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
5234 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
5242 return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
5250 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
5258 return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
5266 return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
5274 return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
5283 __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
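/* Usage sketch (illustrative): the movdqa32/movdqa64 load and store forms
 * above require 64-byte-aligned pointers; the masked load merges loaded
 * lanes into the fallback `w`. `p` is an assumed 64-byte-aligned const
 * int pointer, `w` an assumed __m512i.
 *
 *   __m512i v = _mm512_mask_load_epi32(w, (__mmask16)0x0F0F, p);
 */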
5290 return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
5291 0, 0, 2, 2, 4, 4, 6, 6);
5297 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
5305 return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
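/* Usage sketch (illustrative): movedup duplicates the even-indexed double
 * of each pair, per the 0,0,2,2,4,4,6,6 shuffle above; with an assumed
 * a = {a0,a1,a2,a3,a4,a5,a6,a7}:
 *
 *   _mm512_movedup_pd(a); // {a0,a0,a2,a2,a4,a4,a6,a6}
 */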
5310 #define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
5311 ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
5312 (__v8df)(__m512d)(B), \
5313 (__v8di)(__m512i)(C), (int)(imm), \
5314 (__mmask8)-1, (int)(R)))
5316 #define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
5317 ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
5318 (__v8df)(__m512d)(B), \
5319 (__v8di)(__m512i)(C), (int)(imm), \
5320 (__mmask8)(U), (int)(R)))
5322 #define _mm512_fixupimm_pd(A, B, C, imm) \
5323 ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
5324 (__v8df)(__m512d)(B), \
5325 (__v8di)(__m512i)(C), (int)(imm), \
5326 (__mmask8)-1, \
5327 _MM_FROUND_CUR_DIRECTION))
5329 #define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
5330 ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
5331 (__v8df)(__m512d)(B), \
5332 (__v8di)(__m512i)(C), (int)(imm), \
5333 (__mmask8)(U), \
5334 _MM_FROUND_CUR_DIRECTION))
5336 #define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
5337 ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
5338 (__v8df)(__m512d)(B), \
5339 (__v8di)(__m512i)(C), \
5340 (int)(imm), (__mmask8)(U), \
5341 (int)(R)))
5343 #define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
5344 ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
5345 (__v8df)(__m512d)(B), \
5346 (__v8di)(__m512i)(C), \
5347 (int)(imm), (__mmask8)(U), \
5348 _MM_FROUND_CUR_DIRECTION))
5350 #define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
5351 ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
5352 (__v16sf)(__m512)(B), \
5353 (__v16si)(__m512i)(C), (int)(imm), \
5354 (__mmask16)-1, (int)(R)))
5356 #define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
5357 ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
5358 (__v16sf)(__m512)(B), \
5359 (__v16si)(__m512i)(C), (int)(imm), \
5360 (__mmask16)(U), (int)(R)))
5362 #define _mm512_fixupimm_ps(A, B, C, imm) \
5363 ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
5364 (__v16sf)(__m512)(B), \
5365 (__v16si)(__m512i)(C), (int)(imm), \
5366 (__mmask16)-1, \
5367 _MM_FROUND_CUR_DIRECTION))
5369 #define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
5370 ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
5371 (__v16sf)(__m512)(B), \
5372 (__v16si)(__m512i)(C), (int)(imm), \
5373 (__mmask16)(U), \
5374 _MM_FROUND_CUR_DIRECTION))
5376 #define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
5377 ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
5378 (__v16sf)(__m512)(B), \
5379 (__v16si)(__m512i)(C), \
5380 (int)(imm), (__mmask16)(U), \
5381 (int)(R)))
5383 #define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
5384 ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
5385 (__v16sf)(__m512)(B), \
5386 (__v16si)(__m512i)(C), \
5387 (int)(imm), (__mmask16)(U), \
5388 _MM_FROUND_CUR_DIRECTION))
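/* Usage sketch (illustrative): _mm512_fixupimm_pd(A, B, C, imm) roughly
 * classifies each lane of B, looks up a 4-bit response token in the
 * matching lane of C, and writes the selected special value (or an A/B
 * passthrough); imm controls fault reporting. An all-zero token table
 * leaves A unchanged. `a` and `b` are assumed __m512d values.
 *
 *   __m512d r = _mm512_fixupimm_pd(a, b, _mm512_setzero_si512(), 0);
 */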
5390 #define _mm_fixupimm_round_sd(A, B, C, imm, R) \
5391 ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
5392 (__v2df)(__m128d)(B), \
5393 (__v2di)(__m128i)(C), (int)(imm), \
5394 (__mmask8)-1, (int)(R)))