11#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
14#ifndef __AVX512VLINTRIN_H
15#define __AVX512VLINTRIN_H
/* Default attributes for the intrinsics in this header: always inline, no
 * debug info, require the "avx512vl" target feature (with "no-evex512", i.e.
 * only the 128-/256-bit EVEX encodings), and record the minimum vector width
 * each variant needs.  (Fix: removed the original-file line numbers that were
 * fused onto these lines by extraction, which made the macros uncompilable.) */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,no-evex512"),                            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,no-evex512"),                            \
                 __min_vector_width__(256)))
/*
 * Predicate-specific wrappers for signed 32-bit element comparisons.  Each
 * expands to the generic _mm{,256}_cmp_epi32_mask macro with the matching
 * _MM_CMPINT_* predicate; the *_mask_* forms forward the caller's write-mask
 * k to the generic masked compare.  (Fix: stripped the stray line numbers
 * fused onto every line by extraction.)
 */
#define _mm_cmpeq_epi32_mask(A, B) \
    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm_mask_cmpeq_epi32_mask(k, A, B) \
    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm_cmpge_epi32_mask(A, B) \
    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
#define _mm_mask_cmpge_epi32_mask(k, A, B) \
    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm_cmpgt_epi32_mask(A, B) \
    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
#define _mm_mask_cmpgt_epi32_mask(k, A, B) \
    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm_cmple_epi32_mask(A, B) \
    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
#define _mm_mask_cmple_epi32_mask(k, A, B) \
    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm_cmplt_epi32_mask(A, B) \
    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
#define _mm_mask_cmplt_epi32_mask(k, A, B) \
    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm_cmpneq_epi32_mask(A, B) \
    _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
#define _mm_mask_cmpneq_epi32_mask(k, A, B) \
    _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)

/* 256-bit (8 x i32) variants. */
#define _mm256_cmpeq_epi32_mask(A, B) \
    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \
    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm256_cmpge_epi32_mask(A, B) \
    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
#define _mm256_mask_cmpge_epi32_mask(k, A, B) \
    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm256_cmpgt_epi32_mask(A, B) \
    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \
    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm256_cmple_epi32_mask(A, B) \
    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
#define _mm256_mask_cmple_epi32_mask(k, A, B) \
    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm256_cmplt_epi32_mask(A, B) \
    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
#define _mm256_mask_cmplt_epi32_mask(k, A, B) \
    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm256_cmpneq_epi32_mask(A, B) \
    _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \
    _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
/*
 * Predicate-specific wrappers for unsigned 32-bit element comparisons.  Each
 * expands to the generic _mm{,256}_cmp_epu32_mask macro with the matching
 * _MM_CMPINT_* predicate; the *_mask_* forms forward the caller's write-mask
 * k.  (Fix: stripped the stray line numbers fused onto every line by
 * extraction.)
 */
#define _mm_cmpeq_epu32_mask(A, B) \
    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm_mask_cmpeq_epu32_mask(k, A, B) \
    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm_cmpge_epu32_mask(A, B) \
    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
#define _mm_mask_cmpge_epu32_mask(k, A, B) \
    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm_cmpgt_epu32_mask(A, B) \
    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
#define _mm_mask_cmpgt_epu32_mask(k, A, B) \
    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm_cmple_epu32_mask(A, B) \
    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
#define _mm_mask_cmple_epu32_mask(k, A, B) \
    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm_cmplt_epu32_mask(A, B) \
    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
#define _mm_mask_cmplt_epu32_mask(k, A, B) \
    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm_cmpneq_epu32_mask(A, B) \
    _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
#define _mm_mask_cmpneq_epu32_mask(k, A, B) \
    _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)

/* 256-bit (8 x u32) variants. */
#define _mm256_cmpeq_epu32_mask(A, B) \
    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \
    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm256_cmpge_epu32_mask(A, B) \
    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
#define _mm256_mask_cmpge_epu32_mask(k, A, B) \
    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm256_cmpgt_epu32_mask(A, B) \
    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \
    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm256_cmple_epu32_mask(A, B) \
    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
#define _mm256_mask_cmple_epu32_mask(k, A, B) \
    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm256_cmplt_epu32_mask(A, B) \
    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
#define _mm256_mask_cmplt_epu32_mask(k, A, B) \
    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm256_cmpneq_epu32_mask(A, B) \
    _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \
    _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
/*
 * Predicate-specific wrappers for signed 64-bit element comparisons.  Each
 * expands to the generic _mm{,256}_cmp_epi64_mask macro with the matching
 * _MM_CMPINT_* predicate; the *_mask_* forms forward the caller's write-mask
 * k.  (Fix: stripped the stray line numbers fused onto every line by
 * extraction.)
 */
#define _mm_cmpeq_epi64_mask(A, B) \
    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm_mask_cmpeq_epi64_mask(k, A, B) \
    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm_cmpge_epi64_mask(A, B) \
    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
#define _mm_mask_cmpge_epi64_mask(k, A, B) \
    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm_cmpgt_epi64_mask(A, B) \
    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
#define _mm_mask_cmpgt_epi64_mask(k, A, B) \
    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm_cmple_epi64_mask(A, B) \
    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
#define _mm_mask_cmple_epi64_mask(k, A, B) \
    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm_cmplt_epi64_mask(A, B) \
    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
#define _mm_mask_cmplt_epi64_mask(k, A, B) \
    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm_cmpneq_epi64_mask(A, B) \
    _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
#define _mm_mask_cmpneq_epi64_mask(k, A, B) \
    _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)

/* 256-bit (4 x i64) variants. */
#define _mm256_cmpeq_epi64_mask(A, B) \
    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \
    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm256_cmpge_epi64_mask(A, B) \
    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
#define _mm256_mask_cmpge_epi64_mask(k, A, B) \
    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm256_cmpgt_epi64_mask(A, B) \
    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \
    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm256_cmple_epi64_mask(A, B) \
    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
#define _mm256_mask_cmple_epi64_mask(k, A, B) \
    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm256_cmplt_epi64_mask(A, B) \
    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
#define _mm256_mask_cmplt_epi64_mask(k, A, B) \
    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm256_cmpneq_epi64_mask(A, B) \
    _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \
    _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
/*
 * Predicate-specific wrappers for unsigned 64-bit element comparisons.  Each
 * expands to the generic _mm{,256}_cmp_epu64_mask macro with the matching
 * _MM_CMPINT_* predicate; the *_mask_* forms forward the caller's write-mask
 * k.  (Fix: stripped the stray line numbers fused onto every line by
 * extraction.)
 */
#define _mm_cmpeq_epu64_mask(A, B) \
    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm_mask_cmpeq_epu64_mask(k, A, B) \
    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm_cmpge_epu64_mask(A, B) \
    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
#define _mm_mask_cmpge_epu64_mask(k, A, B) \
    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm_cmpgt_epu64_mask(A, B) \
    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
#define _mm_mask_cmpgt_epu64_mask(k, A, B) \
    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm_cmple_epu64_mask(A, B) \
    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
#define _mm_mask_cmple_epu64_mask(k, A, B) \
    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm_cmplt_epu64_mask(A, B) \
    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
#define _mm_mask_cmplt_epu64_mask(k, A, B) \
    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm_cmpneq_epu64_mask(A, B) \
    _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
#define _mm_mask_cmpneq_epu64_mask(k, A, B) \
    _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)

/* 256-bit (4 x u64) variants. */
#define _mm256_cmpeq_epu64_mask(A, B) \
    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \
    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm256_cmpge_epu64_mask(A, B) \
    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
#define _mm256_mask_cmpge_epu64_mask(k, A, B) \
    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm256_cmpgt_epu64_mask(A, B) \
    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \
    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm256_cmple_epu64_mask(A, B) \
    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
#define _mm256_mask_cmple_epu64_mask(k, A, B) \
    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm256_cmplt_epu64_mask(A, B) \
    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
#define _mm256_mask_cmplt_epu64_mask(k, A, B) \
    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm256_cmpneq_epu64_mask(A, B) \
    _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \
    _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
235 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
243 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
251 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
259 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
267 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
275 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
283 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
291 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
299 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
307 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
315 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
323 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
331 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
339 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
347 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
355 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
363 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
371 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
379 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
387 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
395 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
403 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
411 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
419 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
427 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
435 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
443 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
451 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
459 return (__m256i)((__v8su)
__a & (__v8su)
__b);
465 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
479 return (__m128i)((__v4su)
__a & (__v4su)
__b);
485 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
499 return (__m256i)(~(__v8su)__A & (__v8su)__B);
505 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
520 return (__m128i)(~(__v4su)__A & (__v4su)__B);
526 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
540 return (__m256i)((__v8su)
__a | (__v8su)
__b);
546 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
560 return (__m128i)((__v4su)
__a | (__v4su)
__b);
566 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
580 return (__m256i)((__v8su)
__a ^ (__v8su)
__b);
586 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
600 return (__m128i)((__v4su)
__a ^ (__v4su)
__b);
606 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
620 return (__m256i)((__v4du)
__a & (__v4du)
__b);
626 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
640 return (__m128i)((__v2du)
__a & (__v2du)
__b);
646 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
660 return (__m256i)(~(__v4du)__A & (__v4du)__B);
666 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
681 return (__m128i)(~(__v2du)__A & (__v2du)__B);
687 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
701 return (__m256i)((__v4du)
__a | (__v4du)
__b);
707 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
721 return (__m128i)((__v2du)
__a | (__v2du)
__b);
727 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
741 return (__m256i)((__v4du)
__a ^ (__v4du)
__b);
747 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
761 return (__m128i)((__v2du)
__a ^ (__v2du)
__b);
768 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
/*
 * Generic 32-bit integer comparisons (VPCMPD / VPCMPUD).  The predicate p is
 * one of the _MM_CMPINT_* constants; the result is a bitmask with one bit
 * per element.  The unmasked forms pass an all-ones write-mask to the
 * builtin; the masked forms forward the caller's mask m.
 * NOTE(review): the final (__mmask8) argument line of each macro was
 * truncated in extraction and has been restored per Intel's definition of
 * these intrinsics — confirm against upstream.  Stray fused line numbers
 * were also removed.
 */
#define _mm_cmp_epi32_mask(a, b, p)                                            \
  ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a),                 \
                                         (__v4si)(__m128i)(b), (int)(p),       \
                                         (__mmask8)-1))
#define _mm_mask_cmp_epi32_mask(m, a, b, p)                                    \
  ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a),                 \
                                         (__v4si)(__m128i)(b), (int)(p),       \
                                         (__mmask8)(m)))
#define _mm_cmp_epu32_mask(a, b, p)                                            \
  ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a),                \
                                          (__v4si)(__m128i)(b), (int)(p),      \
                                          (__mmask8)-1))
#define _mm_mask_cmp_epu32_mask(m, a, b, p)                                    \
  ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a),                \
                                          (__v4si)(__m128i)(b), (int)(p),      \
                                          (__mmask8)(m)))
#define _mm256_cmp_epi32_mask(a, b, p)                                         \
  ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a),                 \
                                         (__v8si)(__m256i)(b), (int)(p),       \
                                         (__mmask8)-1))
#define _mm256_mask_cmp_epi32_mask(m, a, b, p)                                 \
  ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a),                 \
                                         (__v8si)(__m256i)(b), (int)(p),       \
                                         (__mmask8)(m)))
#define _mm256_cmp_epu32_mask(a, b, p)                                         \
  ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a),                \
                                          (__v8si)(__m256i)(b), (int)(p),      \
                                          (__mmask8)-1))
#define _mm256_mask_cmp_epu32_mask(m, a, b, p)                                 \
  ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a),                \
                                          (__v8si)(__m256i)(b), (int)(p),      \
                                          (__mmask8)(m)))
/*
 * Generic 64-bit integer comparisons (VPCMPQ / VPCMPUQ).  Same contract as
 * the 32-bit forms: predicate p is an _MM_CMPINT_* constant, the result is a
 * per-element bitmask; unmasked forms use an all-ones write-mask, masked
 * forms forward the caller's mask m.
 * NOTE(review): the final (__mmask8) argument line of each macro was
 * truncated in extraction and has been restored per Intel's definition of
 * these intrinsics — confirm against upstream.  Stray fused line numbers
 * were also removed.
 */
#define _mm_cmp_epi64_mask(a, b, p)                                            \
  ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a),                 \
                                         (__v2di)(__m128i)(b), (int)(p),       \
                                         (__mmask8)-1))
#define _mm_mask_cmp_epi64_mask(m, a, b, p)                                    \
  ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a),                 \
                                         (__v2di)(__m128i)(b), (int)(p),       \
                                         (__mmask8)(m)))
#define _mm_cmp_epu64_mask(a, b, p)                                            \
  ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a),                \
                                          (__v2di)(__m128i)(b), (int)(p),      \
                                          (__mmask8)-1))
#define _mm_mask_cmp_epu64_mask(m, a, b, p)                                    \
  ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a),                \
                                          (__v2di)(__m128i)(b), (int)(p),      \
                                          (__mmask8)(m)))
#define _mm256_cmp_epi64_mask(a, b, p)                                         \
  ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a),                 \
                                         (__v4di)(__m256i)(b), (int)(p),       \
                                         (__mmask8)-1))
#define _mm256_mask_cmp_epi64_mask(m, a, b, p)                                 \
  ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a),                 \
                                         (__v4di)(__m256i)(b), (int)(p),       \
                                         (__mmask8)(m)))
#define _mm256_cmp_epu64_mask(a, b, p)                                         \
  ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a),                \
                                          (__v4di)(__m256i)(b), (int)(p),      \
                                          (__mmask8)-1))
#define _mm256_mask_cmp_epu64_mask(m, a, b, p)                                 \
  ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a),                \
                                          (__v4di)(__m256i)(b), (int)(p),      \
                                          (__mmask8)(m)))
/*
 * Floating-point comparisons to mask (VCMPPS / VCMPPD).  The predicate p is
 * a _CMP_* comparison constant; the result is a per-element bitmask.
 * Unmasked forms pass an all-ones write-mask; masked forms forward the
 * caller's mask m.
 * NOTE(review): the final (__mmask8) argument line of each macro was
 * truncated in extraction and has been restored per Intel's definition of
 * these intrinsics — confirm against upstream.  Stray fused line numbers
 * were also removed.
 */
#define _mm256_cmp_ps_mask(a, b, p)                                            \
  ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a),                 \
                                          (__v8sf)(__m256)(b), (int)(p),       \
                                          (__mmask8)-1))
#define _mm256_mask_cmp_ps_mask(m, a, b, p)                                    \
  ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a),                 \
                                          (__v8sf)(__m256)(b), (int)(p),       \
                                          (__mmask8)(m)))
#define _mm256_cmp_pd_mask(a, b, p)                                            \
  ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a),                \
                                          (__v4df)(__m256d)(b), (int)(p),      \
                                          (__mmask8)-1))
#define _mm256_mask_cmp_pd_mask(m, a, b, p)                                    \
  ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a),                \
                                          (__v4df)(__m256d)(b), (int)(p),      \
                                          (__mmask8)(m)))
#define _mm_cmp_ps_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a),                 \
                                          (__v4sf)(__m128)(b), (int)(p),       \
                                          (__mmask8)-1))
#define _mm_mask_cmp_ps_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a),                 \
                                          (__v4sf)(__m128)(b), (int)(p),       \
                                          (__mmask8)(m)))
#define _mm_cmp_pd_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a),                \
                                          (__v2df)(__m128d)(b), (int)(p),      \
                                          (__mmask8)-1))
#define _mm_mask_cmp_pd_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a),                \
                                          (__v2df)(__m128d)(b), (int)(p),      \
                                          (__mmask8)(m)))
902 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
903 __builtin_ia32_vfmaddpd ((__v2df) __A,
912 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
913 __builtin_ia32_vfmaddpd ((__v2df) __A,
922 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
923 __builtin_ia32_vfmaddpd ((__v2df) __A,
932 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
933 __builtin_ia32_vfmaddpd ((__v2df) __A,
942 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
943 __builtin_ia32_vfmaddpd ((__v2df) __A,
952 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
953 __builtin_ia32_vfmaddpd (-(__v2df) __A,
962 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
963 __builtin_ia32_vfmaddpd (-(__v2df) __A,
972 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
973 __builtin_ia32_vfmaddpd (-(__v2df) __A,
982 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
983 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
992 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
993 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1002 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1003 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1012 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1013 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1022 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1023 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1032 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1033 __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
1042 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1043 __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
1052 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1053 __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
1062 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1063 __builtin_ia32_vfmaddps ((__v4sf) __A,
1072 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1073 __builtin_ia32_vfmaddps ((__v4sf) __A,
1082 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1083 __builtin_ia32_vfmaddps ((__v4sf) __A,
1092 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1093 __builtin_ia32_vfmaddps ((__v4sf) __A,
1102 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1103 __builtin_ia32_vfmaddps ((__v4sf) __A,
1112 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1113 __builtin_ia32_vfmaddps (-(__v4sf) __A,
1122 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1123 __builtin_ia32_vfmaddps (-(__v4sf) __A,
1132 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1133 __builtin_ia32_vfmaddps (-(__v4sf) __A,
1142 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1143 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1152 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1153 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1162 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1163 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1172 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1173 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1182 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1183 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1192 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1193 __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
1202 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1203 __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
1212 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1213 __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
1222 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1223 __builtin_ia32_vfmaddsubpd ((__v2df) __A,
1232 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1233 __builtin_ia32_vfmaddsubpd ((__v2df) __A,
1242 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1243 __builtin_ia32_vfmaddsubpd ((__v2df) __A,
1252 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1253 __builtin_ia32_vfmaddsubpd ((__v2df) __A,
1262 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1263 __builtin_ia32_vfmaddsubpd ((__v2df) __A,
1272 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1273 __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
1282 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1283 __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
1292 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1293 __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
1302 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1303 __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
1312 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1313 __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
1322 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1323 __builtin_ia32_vfmaddsubps ((__v4sf) __A,
1332 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1333 __builtin_ia32_vfmaddsubps ((__v4sf) __A,
1342 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1343 __builtin_ia32_vfmaddsubps ((__v4sf) __A,
1352 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1353 __builtin_ia32_vfmaddsubps ((__v4sf) __A,
1362 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1363 __builtin_ia32_vfmaddsubps ((__v4sf) __A,
1373 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1374 __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
1383 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1384 __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
1393 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1394 __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
1403 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1404 __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
1413 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1414 __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
1423 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1424 __builtin_ia32_vfmaddpd ((__v2df) __A,
1433 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1434 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1443 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1444 __builtin_ia32_vfmaddps ((__v4sf) __A,
1453 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1454 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1463 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1464 __builtin_ia32_vfmaddsubpd ((__v2df) __A,
1473 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1474 __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
1483 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1484 __builtin_ia32_vfmaddsubps ((__v4sf) __A,
1493 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1494 __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
1503 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1504 __builtin_ia32_vfmaddpd ((__v2df) __A,
1513 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1514 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1523 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1524 __builtin_ia32_vfmaddps ((__v4sf) __A,
1533 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1534 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1543 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1544 __builtin_ia32_vfmaddpd ((__v2df) __A,
1553 return (__m128d) __builtin_ia32_selectpd_128((
__mmask8) __U,
1554 __builtin_ia32_vfmaddpd ((__v2df) __A,
1563 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1564 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1573 return (__m256d) __builtin_ia32_selectpd_256((
__mmask8) __U,
1574 __builtin_ia32_vfmaddpd256 ((__v4df) __A,
1583 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1584 __builtin_ia32_vfmaddps ((__v4sf) __A,
1593 return (__m128) __builtin_ia32_selectps_128((
__mmask8) __U,
1594 __builtin_ia32_vfmaddps ((__v4sf) __A,
1603 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1604 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1613 return (__m256) __builtin_ia32_selectps_256((
__mmask8) __U,
1614 __builtin_ia32_vfmaddps256 ((__v8sf) __A,
1622 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
1629 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
1636 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
1643 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
1650 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
1657 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
1664 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
1671 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
1678 return (__m128i) __builtin_ia32_selectd_128 ((
__mmask8) __U,
1685 return (__m256i) __builtin_ia32_selectd_256 ((
__mmask8) __U,
1692 return (__m128d) __builtin_ia32_selectpd_128 ((
__mmask8) __U,
1699 return (__m256d) __builtin_ia32_selectpd_256 ((
__mmask8) __U,
1706 return (__m128) __builtin_ia32_selectps_128 ((
__mmask8) __U,
1713 return (__m256) __builtin_ia32_selectps_256 ((
__mmask8) __U,
1720 return (__m128i) __builtin_ia32_selectq_128 ((
__mmask8) __U,
1727 return (__m256i) __builtin_ia32_selectq_256 ((
__mmask8) __U,
1734 return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
1741 return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
1749 return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
1756 return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
1764 return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
1771 return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
1779 return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
1786 return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
1794 return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
1801 return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
1809 return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
1816 return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
1824 return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
1831 return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
1839 return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
1846 return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
1854 __builtin_ia32_compressstoredf128_mask ((__v2df *)
__P,
1861 __builtin_ia32_compressstoredf256_mask ((__v4df *)
__P,
1868 __builtin_ia32_compressstoredi128_mask ((__v2di *)
__P,
1875 __builtin_ia32_compressstoredi256_mask ((__v4di *)
__P,
1882 __builtin_ia32_compressstoresf128_mask ((__v4sf *)
__P,
1889 __builtin_ia32_compressstoresf256_mask ((__v8sf *)
__P,
1896 __builtin_ia32_compressstoresi128_mask ((__v4si *)
__P,
1903 __builtin_ia32_compressstoresi256_mask ((__v8si *)
__P,
1910 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8) __U,
1917 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8) __U,
1924 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8) __U,
1931 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8) __U,
1938 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
1945 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
1952 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
1959 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
1966 return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
1973 return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
1981 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
1988 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
1995 return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
2002 return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
2010 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2017 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2024 return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
2032 return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
2039 return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
2047 return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
2055 return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
2062 return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
2070 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2077 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2084 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
2091 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
2098 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2105 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2112 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2119 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2126 return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
2134 return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
2141 return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
2149 return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
2157 return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
2164 return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
2172 return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
2179 return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
2187 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2194 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2201 return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
2209 return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
2216 return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
2224 return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
2232 return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
2239 return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
2247 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2254 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2261 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
2268 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
2275 return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
2283 return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
2290 return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
2298 return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
2306 return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
2313 return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
2321 return (__m128d) __builtin_convertvector(
2322 __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
2327 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8) __U,
2334 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8) __U,
2341 return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
2346 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8) __U,
2353 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8) __U,
2360 return (__m128)__builtin_convertvector((__v4su)__A, __v4sf);
2365 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2372 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2379 return (__m256)__builtin_convertvector((__v8su)__A, __v8sf);
2384 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2391 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2398 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2405 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2412 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2419 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2426 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2433 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2440 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2447 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2454 return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
2461 return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
2469 return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
2476 return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
2484 return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
2491 return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
2499 return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
2506 return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
2514 return (__m128d) __builtin_ia32_expandloaddf128_mask ((
const __v2df *)
__P,
2522 return (__m128d) __builtin_ia32_expandloaddf128_mask ((
const __v2df *)
__P,
2531 return (__m256d) __builtin_ia32_expandloaddf256_mask ((
const __v4df *)
__P,
2539 return (__m256d) __builtin_ia32_expandloaddf256_mask ((
const __v4df *)
__P,
2548 return (__m128i) __builtin_ia32_expandloaddi128_mask ((
const __v2di *)
__P,
2556 return (__m128i) __builtin_ia32_expandloaddi128_mask ((
const __v2di *)
__P,
2566 return (__m256i) __builtin_ia32_expandloaddi256_mask ((
const __v4di *)
__P,
2574 return (__m256i) __builtin_ia32_expandloaddi256_mask ((
const __v4di *)
__P,
2583 return (__m128) __builtin_ia32_expandloadsf128_mask ((
const __v4sf *)
__P,
2590 return (__m128) __builtin_ia32_expandloadsf128_mask ((
const __v4sf *)
__P,
2599 return (__m256) __builtin_ia32_expandloadsf256_mask ((
const __v8sf *)
__P,
2606 return (__m256) __builtin_ia32_expandloadsf256_mask ((
const __v8sf *)
__P,
2615 return (__m128i) __builtin_ia32_expandloadsi128_mask ((
const __v4si *)
__P,
2623 return (__m128i) __builtin_ia32_expandloadsi128_mask ((
const __v4si *)
__P,
2632 return (__m256i) __builtin_ia32_expandloadsi256_mask ((
const __v8si *)
__P,
2640 return (__m256i) __builtin_ia32_expandloadsi256_mask ((
const __v8si *)
__P,
2649 return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
2656 return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
2664 return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
2671 return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
2679 return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
2686 return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
2694 return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
2701 return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
2709 return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
2717 return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
2724 return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
2732 return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
2740 return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
2747 return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
2755 return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
2763 return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
2770 return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
2778 return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
2786 return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
2793 return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
2801 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2808 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2815 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2822 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2829 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2836 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2843 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2850 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2857 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2864 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2871 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2878 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2885 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2892 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2899 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2906 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2913 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2920 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
2927 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2934 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
2941 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2948 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
2955 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2962 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
2969 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2976 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
2983 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
2990 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
2997 return (__m128i)__builtin_elementwise_abs((__v2di)__A);
3002 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
3009 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
3016 return (__m256i)__builtin_elementwise_abs((__v4di)__A);
3021 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
3028 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
3035 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3042 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3049 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3056 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3063 return (__m128i)__builtin_elementwise_max((__v2di)__A, (__v2di)__B);
3068 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3075 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3082 return (__m256i)__builtin_elementwise_max((__v4di)__A, (__v4di)__B);
3087 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3094 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3101 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3108 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3115 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3122 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3129 return (__m128i)__builtin_elementwise_max((__v2du)__A, (__v2du)__B);
3134 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3141 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3148 return (__m256i)__builtin_elementwise_max((__v4du)__A, (__v4du)__B);
3153 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3160 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3167 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3174 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3181 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3188 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3195 return (__m128i)__builtin_elementwise_min((__v2di)__A, (__v2di)__B);
3200 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3207 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3214 return (__m256i)__builtin_elementwise_min((__v4di)__A, (__v4di)__B);
3219 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3226 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3233 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3240 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__M,
3247 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3254 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__M,
3261 return (__m128i)__builtin_elementwise_min((__v2du)__A, (__v2du)__B);
3266 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3273 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__M,
3280 return (__m256i)__builtin_elementwise_min((__v4du)__A, (__v4du)__B);
3285 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3292 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__M,
3297#define _mm_roundscale_pd(A, imm) \
3298 ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
3300 (__v2df)_mm_setzero_pd(), \
3304#define _mm_mask_roundscale_pd(W, U, A, imm) \
3305 ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
3307 (__v2df)(__m128d)(W), \
3311#define _mm_maskz_roundscale_pd(U, A, imm) \
3312 ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
3314 (__v2df)_mm_setzero_pd(), \
3318#define _mm256_roundscale_pd(A, imm) \
3319 ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
3321 (__v4df)_mm256_setzero_pd(), \
3325#define _mm256_mask_roundscale_pd(W, U, A, imm) \
3326 ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
3328 (__v4df)(__m256d)(W), \
3332#define _mm256_maskz_roundscale_pd(U, A, imm) \
3333 ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
3335 (__v4df)_mm256_setzero_pd(), \
3338#define _mm_roundscale_ps(A, imm) \
3339 ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
3340 (__v4sf)_mm_setzero_ps(), \
3344#define _mm_mask_roundscale_ps(W, U, A, imm) \
3345 ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
3346 (__v4sf)(__m128)(W), \
3350#define _mm_maskz_roundscale_ps(U, A, imm) \
3351 ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
3352 (__v4sf)_mm_setzero_ps(), \
3355#define _mm256_roundscale_ps(A, imm) \
3356 ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
3357 (__v8sf)_mm256_setzero_ps(), \
3360#define _mm256_mask_roundscale_ps(W, U, A, imm) \
3361 ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
3362 (__v8sf)(__m256)(W), \
3366#define _mm256_maskz_roundscale_ps(U, A, imm) \
3367 ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
3368 (__v8sf)_mm256_setzero_ps(), \
3373 return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
3383 return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
3391 return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
3400 return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
3410 return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
3418 return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
3427 return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
3436 return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
3444 return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
3453 return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
3463 return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
3471 return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
/* 64-bit-index scatter stores for double / 64-bit-integer elements.
 * Each macro forwards to the corresponding __builtin_ia32_scatterdiv* builtin:
 * the unmasked forms pass an all-ones mask ((__mmask8)-1) so every element of
 * (v1) is stored; the _mask_ forms store only elements whose mask bit is set.
 * Extraction artifact fixed: stray source line numbers fused into the macro
 * text (e.g. "3478#define") made the header non-compiling. */
#define _mm_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)-1, \
                               (__v2di)(__m128i)(index), \
                               (__v2df)(__m128d)(v1), (int)(scale))

#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)(mask), \
                               (__v2di)(__m128i)(index), \
                               (__v2df)(__m128d)(v1), (int)(scale))

#define _mm_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)-1, \
                               (__v2di)(__m128i)(index), \
                               (__v2di)(__m128i)(v1), (int)(scale))

#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)(mask), \
                               (__v2di)(__m128i)(index), \
                               (__v2di)(__m128i)(v1), (int)(scale))

#define _mm256_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)-1, \
                               (__v4di)(__m256i)(index), \
                               (__v4df)(__m256d)(v1), (int)(scale))

#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)(mask), \
                               (__v4di)(__m256i)(index), \
                               (__v4df)(__m256d)(v1), (int)(scale))

#define _mm256_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)-1, \
                               (__v4di)(__m256i)(index), \
                               (__v4di)(__m256i)(v1), (int)(scale))

#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)(mask), \
                               (__v4di)(__m256i)(index), \
                               (__v4di)(__m256i)(v1), (int)(scale))
3518#define _mm_i64scatter_ps(addr, index, v1, scale) \
3519 __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)-1, \
3520 (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
3523#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \
3524 __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)(mask), \
3525 (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
/* Scatter 32-bit integer elements using 64-bit indices (128-bit forms).
 * Unmasked form uses an all-ones mask; masked form stores only elements whose
 * bit in (mask) is set. Stray fused line numbers from extraction removed. */
#define _mm_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)-1, \
                               (__v2di)(__m128i)(index), \
                               (__v4si)(__m128i)(v1), (int)(scale))

#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)(mask), \
                               (__v2di)(__m128i)(index), \
                               (__v4si)(__m128i)(v1), (int)(scale))
3538#define _mm256_i64scatter_ps(addr, index, v1, scale) \
3539 __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)-1, \
3540 (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
3543#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \
3544 __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)(mask), \
3545 (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
/* Scatter 32-bit integer elements using four 64-bit indices (256-bit index,
 * 128-bit data). Unmasked form passes an all-ones mask; masked form stores
 * only elements selected by (mask). Fused extraction line numbers removed. */
#define _mm256_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)-1, \
                               (__v4di)(__m256i)(index), \
                               (__v4si)(__m128i)(v1), (int)(scale))

#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)(mask), \
                               (__v4di)(__m256i)(index), \
                               (__v4si)(__m128i)(v1), (int)(scale))
/* 32-bit-index scatter stores for double / 64-bit-integer elements.
 * Forward to the __builtin_ia32_scattersiv* builtins; unmasked forms pass an
 * all-ones mask, _mask_ forms store only elements selected by (mask).
 * Extraction artifact fixed: fused source line numbers removed. */
#define _mm_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)-1, \
                               (__v4si)(__m128i)(index), \
                               (__v2df)(__m128d)(v1), (int)(scale))

#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)(mask), \
                               (__v4si)(__m128i)(index), \
                               (__v2df)(__m128d)(v1), (int)(scale))

#define _mm_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)-1, \
                               (__v4si)(__m128i)(index), \
                               (__v2di)(__m128i)(v1), (int)(scale))

#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)(mask), \
                               (__v4si)(__m128i)(index), \
                               (__v2di)(__m128i)(v1), (int)(scale))

#define _mm256_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)-1, \
                               (__v4si)(__m128i)(index), \
                               (__v4df)(__m256d)(v1), (int)(scale))

#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)(mask), \
                               (__v4si)(__m128i)(index), \
                               (__v4df)(__m256d)(v1), (int)(scale))

#define _mm256_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)-1, \
                               (__v4si)(__m128i)(index), \
                               (__v4di)(__m256i)(v1), (int)(scale))

#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)(mask), \
                               (__v4si)(__m128i)(index), \
                               (__v4di)(__m256i)(v1), (int)(scale))
3598#define _mm_i32scatter_ps(addr, index, v1, scale) \
3599 __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)-1, \
3600 (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
3603#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \
3604 __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)(mask), \
3605 (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
/* Scatter 32-bit integer elements using 32-bit indices (128-bit forms).
 * Unmasked form uses an all-ones mask; masked form stores only elements whose
 * bit in (mask) is set. Stray fused line numbers from extraction removed. */
#define _mm_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)-1, \
                               (__v4si)(__m128i)(index), \
                               (__v4si)(__m128i)(v1), (int)(scale))

#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)(mask), \
                               (__v4si)(__m128i)(index), \
                               (__v4si)(__m128i)(v1), (int)(scale))
3618#define _mm256_i32scatter_ps(addr, index, v1, scale) \
3619 __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)-1, \
3620 (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
3623#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \
3624 __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)(mask), \
3625 (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
/* Scatter eight 32-bit integer elements using 32-bit indices (256-bit forms).
 * Unmasked form passes an all-ones mask; masked form stores only elements
 * selected by (mask). Fused extraction line numbers removed. */
#define _mm256_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8si)(__m256i)(v1), (int)(scale))

#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8si)(__m256i)(v1), (int)(scale))
3640 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
3647 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
3654 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
3661 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
3668 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
3675 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
3682 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
3689 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
3696 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
3703 return (__m128d)__builtin_ia32_selectpd_128((
__mmask8)__U,
3710 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
3717 return (__m256d)__builtin_ia32_selectpd_256((
__mmask8)__U,
3724 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
3731 return (__m128)__builtin_ia32_selectps_128((
__mmask8)__U,
3738 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
3745 return (__m256)__builtin_ia32_selectps_256((
__mmask8)__U,
3752 return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
3759 return (__m128i)__builtin_ia32_selectd_128(__U,
3767 return (__m128i)__builtin_ia32_selectd_128(__U,
3775 return (__m128i)__builtin_ia32_selectd_128(__U,
3782 return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
3789 return (__m256i)__builtin_ia32_selectd_256(__U,
3797 return (__m256i)__builtin_ia32_selectd_256(__U,
3805 return (__m256i)__builtin_ia32_selectd_256(__U,
3812 return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
3818 return (__m128d)__builtin_ia32_selectpd_128(__U,
3825 return (__m128d)__builtin_ia32_selectpd_128(__U,
3827 (__v2df)(__m128d)__I);
3832 return (__m128d)__builtin_ia32_selectpd_128(__U,
3839 return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
3846 return (__m256d)__builtin_ia32_selectpd_256(__U,
3854 return (__m256d)__builtin_ia32_selectpd_256(__U,
3856 (__v4df)(__m256d)__I);
3862 return (__m256d)__builtin_ia32_selectpd_256(__U,
3869 return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
3875 return (__m128)__builtin_ia32_selectps_128(__U,
3882 return (__m128)__builtin_ia32_selectps_128(__U,
3884 (__v4sf)(__m128)__I);
3889 return (__m128)__builtin_ia32_selectps_128(__U,
3896 return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
3902 return (__m256)__builtin_ia32_selectps_256(__U,
3910 return (__m256)__builtin_ia32_selectps_256(__U,
3912 (__v8sf)(__m256)__I);
3918 return (__m256)__builtin_ia32_selectps_256(__U,
3925 return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
3932 return (__m128i)__builtin_ia32_selectq_128(__U,
3940 return (__m128i)__builtin_ia32_selectq_128(__U,
3948 return (__m128i)__builtin_ia32_selectq_128(__U,
3956 return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
3963 return (__m256i)__builtin_ia32_selectq_256(__U,
3971 return (__m256i)__builtin_ia32_selectq_256(__U,
3979 return (__m256i)__builtin_ia32_selectq_256(__U,
3987 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
3995 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4003 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4011 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4019 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4027 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4035 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4043 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4051 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4059 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4067 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4075 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4083 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4091 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4099 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4107 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4115 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4123 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4131 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4139 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4148 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4156 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4164 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4172 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4180 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4188 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4196 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4204 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4212 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4220 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4228 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4236 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4244 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4252 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4260 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4268 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4276 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4284 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4292 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4300 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
/* Rotate-left-by-immediate (VPROLD/VPROLQ) for 32- and 64-bit elements.
 * The plain forms expand to the prold/prolq builtins; the _mask_ forms merge
 * the rotated result over (w) under write-mask (u) via a select builtin, and
 * the _maskz_ forms zero the unselected lanes. These must be macros because
 * (b) is an immediate operand. Fused extraction line numbers removed. */
#define _mm_rol_epi32(a, b) \
  ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b)))

#define _mm_mask_rol_epi32(w, u, a, b) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
                                       (__v4si)_mm_rol_epi32((a), (b)), \
                                       (__v4si)(__m128i)(w)))

#define _mm_maskz_rol_epi32(u, a, b) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
                                       (__v4si)_mm_rol_epi32((a), (b)), \
                                       (__v4si)_mm_setzero_si128()))

#define _mm256_rol_epi32(a, b) \
  ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b)))

#define _mm256_mask_rol_epi32(w, u, a, b) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
                                       (__v8si)_mm256_rol_epi32((a), (b)), \
                                       (__v8si)(__m256i)(w)))

#define _mm256_maskz_rol_epi32(u, a, b) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
                                       (__v8si)_mm256_rol_epi32((a), (b)), \
                                       (__v8si)_mm256_setzero_si256()))

#define _mm_rol_epi64(a, b) \
  ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b)))

#define _mm_mask_rol_epi64(w, u, a, b) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
                                       (__v2di)_mm_rol_epi64((a), (b)), \
                                       (__v2di)(__m128i)(w)))

#define _mm_maskz_rol_epi64(u, a, b) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
                                       (__v2di)_mm_rol_epi64((a), (b)), \
                                       (__v2di)_mm_setzero_si128()))

#define _mm256_rol_epi64(a, b) \
  ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b)))

#define _mm256_mask_rol_epi64(w, u, a, b) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
                                       (__v4di)_mm256_rol_epi64((a), (b)), \
                                       (__v4di)(__m256i)(w)))

#define _mm256_maskz_rol_epi64(u, a, b) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
                                       (__v4di)_mm256_rol_epi64((a), (b)), \
                                       (__v4di)_mm256_setzero_si256()))
4361 return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B);
4367 return (__m128i)__builtin_ia32_selectd_128(__U,
4375 return (__m128i)__builtin_ia32_selectd_128(__U,
4383 return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B);
4389 return (__m256i)__builtin_ia32_selectd_256(__U,
4397 return (__m256i)__builtin_ia32_selectd_256(__U,
4405 return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B);
4411 return (__m128i)__builtin_ia32_selectq_128(__U,
4419 return (__m128i)__builtin_ia32_selectq_128(__U,
4427 return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B);
4433 return (__m256i)__builtin_ia32_selectq_256(__U,
4441 return (__m256i)__builtin_ia32_selectq_256(__U,
/* Rotate-right-by-immediate (VPRORD/VPRORQ) for 32- and 64-bit elements.
 * Same structure as the rol macros: plain forms expand to the prord/prorq
 * builtins; _mask_ forms merge over (w) under write-mask (u); _maskz_ forms
 * zero the unselected lanes. Macros because (b) must be an immediate.
 * Fused extraction line numbers removed. */
#define _mm_ror_epi32(a, b) \
  ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b)))

#define _mm_mask_ror_epi32(w, u, a, b) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
                                       (__v4si)_mm_ror_epi32((a), (b)), \
                                       (__v4si)(__m128i)(w)))

#define _mm_maskz_ror_epi32(u, a, b) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
                                       (__v4si)_mm_ror_epi32((a), (b)), \
                                       (__v4si)_mm_setzero_si128()))

#define _mm256_ror_epi32(a, b) \
  ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b)))

#define _mm256_mask_ror_epi32(w, u, a, b) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
                                       (__v8si)_mm256_ror_epi32((a), (b)), \
                                       (__v8si)(__m256i)(w)))

#define _mm256_maskz_ror_epi32(u, a, b) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
                                       (__v8si)_mm256_ror_epi32((a), (b)), \
                                       (__v8si)_mm256_setzero_si256()))

#define _mm_ror_epi64(a, b) \
  ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b)))

#define _mm_mask_ror_epi64(w, u, a, b) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
                                       (__v2di)_mm_ror_epi64((a), (b)), \
                                       (__v2di)(__m128i)(w)))

#define _mm_maskz_ror_epi64(u, a, b) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
                                       (__v2di)_mm_ror_epi64((a), (b)), \
                                       (__v2di)_mm_setzero_si128()))

#define _mm256_ror_epi64(a, b) \
  ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b)))

#define _mm256_mask_ror_epi64(w, u, a, b) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
                                       (__v4di)_mm256_ror_epi64((a), (b)), \
                                       (__v4di)(__m256i)(w)))

#define _mm256_maskz_ror_epi64(u, a, b) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
                                       (__v4di)_mm256_ror_epi64((a), (b)), \
                                       (__v4di)_mm256_setzero_si256()))
4501 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4509 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4517 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4525 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4533 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4541 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4549 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4557 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4565 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4573 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4581 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4589 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4597 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4605 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4613 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4621 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4629 return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B);
4635 return (__m128i)__builtin_ia32_selectd_128(__U,
4643 return (__m128i)__builtin_ia32_selectd_128(__U,
4651 return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B);
4657 return (__m256i)__builtin_ia32_selectd_256(__U,
4665 return (__m256i)__builtin_ia32_selectd_256(__U,
4673 return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B);
4679 return (__m128i)__builtin_ia32_selectq_128(__U,
4687 return (__m128i)__builtin_ia32_selectq_128(__U,
4695 return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B);
4701 return (__m256i)__builtin_ia32_selectq_256(__U,
4709 return (__m256i)__builtin_ia32_selectq_256(__U,
4717 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4725 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4733 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4741 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4749 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4757 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4765 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4773 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4781 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4789 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4797 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4805 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4813 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4821 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4829 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4837 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4845 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4853 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4861 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4869 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4877 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4885 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4893 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4901 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4909 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4917 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4925 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4933 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4941 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4949 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
4957 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4965 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,
4973 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4981 return (__m128i)__builtin_ia32_selectd_128((
__mmask8)__U,
4989 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
4997 return (__m256i)__builtin_ia32_selectd_256((
__mmask8)__U,
5005 return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)
__Y);
5011 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
5019 return (__m128i)__builtin_ia32_selectq_128((
__mmask8)__U,
5027 return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di)
__Y);
5033 return (__m256i)__builtin_ia32_selectq_256((
__mmask8)__U,