clang 20.0.0git
avx10_2niintrin.h
Go to the documentation of this file.
1/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9#ifndef __IMMINTRIN_H
10#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
11#endif
12
13#ifdef __SSE2__
14
15#ifndef __AVX10_2NIINTRIN_H
16#define __AVX10_2NIINTRIN_H
17
/* Attribute set for the 128-bit wrapper functions in this header: forced
 * inlining, no debug info, the "avx10.2-256" target feature and a minimum
 * vector width of 128. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx10.2-256"), __min_vector_width__(128)))
/* Attribute set for the 256-bit wrapper functions in this header: forced
 * inlining, no debug info, the "avx10.2-256" target feature and a minimum
 * vector width of 256. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx10.2-256"), __min_vector_width__(256)))
24
25/* VNNI FP16 */
26static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
27 __m128h __A,
28 __m128h __B) {
29 return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
30 (__v8hf)__B);
31}
32
33static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
34 __mmask8 __U,
35 __m128h __A,
36 __m128h __B) {
37 return (__m128)__builtin_ia32_selectps_128(
38 (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
39}
40
41static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
42 __m128 __W,
43 __m128h __A,
44 __m128h __B) {
45 return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
46 (__v4sf)_mm_dpph_ps(__W, __A, __B),
47 (__v4sf)_mm_setzero_ps());
48}
49
50static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
51 __m256h __A,
52 __m256h __B) {
53 return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
54 (__v16hf)__B);
55}
56
57static __inline__ __m256 __DEFAULT_FN_ATTRS256
58_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
59 return (__m256)__builtin_ia32_selectps_256(
60 (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
61}
62
63static __inline__ __m256 __DEFAULT_FN_ATTRS256
64_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
65 return (__m256)__builtin_ia32_selectps_256(
66 (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
67 (__v8sf)_mm256_setzero_ps());
68}
69
70/* VMPSADBW */
/* Masked 128-bit MPSADBW wrappers.  Both evaluate
 * _mm_mpsadbw_epu8(A, B, imm), view the result as __v8hi and pass it to
 * __builtin_ia32_selectw_128 under the 8-bit mask U; the other select
 * operand is W (mask form) or an all-zero vector (maskz form). */
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), \
      (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
      (__v8hi)(__m128i)(W)))

#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), \
      (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
      (__v8hi)_mm_setzero_si128()))
80
/* Masked 256-bit MPSADBW wrappers.  Both evaluate
 * _mm256_mpsadbw_epu8(A, B, imm), view the result as __v16hi and pass it to
 * __builtin_ia32_selectw_256 under the 16-bit mask U; the other select
 * operand is W (mask form) or an all-zero vector (maskz form). */
#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectw_256( \
      (__mmask16)(U), \
      (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectw_256( \
      (__mmask16)(U), \
      (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
      (__v16hi)_mm256_setzero_si256()))
90
91/* VNNI INT8 */
92static __inline__ __m128i __DEFAULT_FN_ATTRS128
93_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
94 return (__m128i)__builtin_ia32_selectd_128(
95 __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
96}
97
98static __inline__ __m128i __DEFAULT_FN_ATTRS128
99_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
100 return (__m128i)__builtin_ia32_selectd_128(
101 __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
102 (__v4si)_mm_setzero_si128());
103}
104
105static __inline__ __m256i __DEFAULT_FN_ATTRS256
106_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
107 return (__m256i)__builtin_ia32_selectd_256(
108 __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
109}
110
111static __inline__ __m256i __DEFAULT_FN_ATTRS256
112_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
113 return (__m256i)__builtin_ia32_selectd_256(
114 __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
115 (__v8si)_mm256_setzero_si256());
116}
117
118static __inline__ __m128i __DEFAULT_FN_ATTRS128
119_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
120 return (__m128i)__builtin_ia32_selectd_128(
121 __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
122}
123
124static __inline__ __m128i __DEFAULT_FN_ATTRS128
125_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
126 return (__m128i)__builtin_ia32_selectd_128(
127 __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
128 (__v4si)_mm_setzero_si128());
129}
130
131static __inline__ __m256i __DEFAULT_FN_ATTRS256
132_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
133 return (__m256i)__builtin_ia32_selectd_256(
134 __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
135}
136
137static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
138 __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
139 return (__m256i)__builtin_ia32_selectd_256(
140 __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
141 (__v8si)_mm256_setzero_si256());
142}
143
144static __inline__ __m128i __DEFAULT_FN_ATTRS128
145_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
146 return (__m128i)__builtin_ia32_selectd_128(
147 __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
148}
149
150static __inline__ __m128i __DEFAULT_FN_ATTRS128
151_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
152 return (__m128i)__builtin_ia32_selectd_128(
153 __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
154 (__v4si)_mm_setzero_si128());
155}
156
157static __inline__ __m256i __DEFAULT_FN_ATTRS256
158_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
159 return (__m256i)__builtin_ia32_selectd_256(
160 __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
161}
162
163static __inline__ __m256i __DEFAULT_FN_ATTRS256
164_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
165 return (__m256i)__builtin_ia32_selectd_256(
166 __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
167 (__v8si)_mm256_setzero_si256());
168}
169
170static __inline__ __m128i __DEFAULT_FN_ATTRS128
171_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
172 return (__m128i)__builtin_ia32_selectd_128(
173 __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
174}
175
176static __inline__ __m128i __DEFAULT_FN_ATTRS128
177_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
178 return (__m128i)__builtin_ia32_selectd_128(
179 __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
180 (__v4si)_mm_setzero_si128());
181}
182
183static __inline__ __m256i __DEFAULT_FN_ATTRS256
184_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
185 return (__m256i)__builtin_ia32_selectd_256(
186 __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
187}
188
189static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
190 __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
191 return (__m256i)__builtin_ia32_selectd_256(
192 __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
193 (__v8si)_mm256_setzero_si256());
194}
195
196static __inline__ __m128i __DEFAULT_FN_ATTRS128
197_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
198 return (__m128i)__builtin_ia32_selectd_128(
199 __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
200}
201
202static __inline__ __m128i __DEFAULT_FN_ATTRS128
203_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
204 return (__m128i)__builtin_ia32_selectd_128(
205 __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
206 (__v4si)_mm_setzero_si128());
207}
208
209static __inline__ __m256i __DEFAULT_FN_ATTRS256
210_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
211 return (__m256i)__builtin_ia32_selectd_256(
212 __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
213}
214
215static __inline__ __m256i __DEFAULT_FN_ATTRS256
216_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
217 return (__m256i)__builtin_ia32_selectd_256(
218 __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
219 (__v8si)_mm256_setzero_si256());
220}
221
222static __inline__ __m128i __DEFAULT_FN_ATTRS128
223_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
224 return (__m128i)__builtin_ia32_selectd_128(
225 __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
226}
227
228static __inline__ __m128i __DEFAULT_FN_ATTRS128
229_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
230 return (__m128i)__builtin_ia32_selectd_128(
231 __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
232 (__v4si)_mm_setzero_si128());
233}
234
235static __inline__ __m256i __DEFAULT_FN_ATTRS256
236_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
237 return (__m256i)__builtin_ia32_selectd_256(
238 __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
239}
240
241static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
242 __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
243 return (__m256i)__builtin_ia32_selectd_256(
244 __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
245 (__v8si)_mm256_setzero_si256());
246}
247
248/* VNNI INT16 */
249static __inline__ __m128i __DEFAULT_FN_ATTRS128
250_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
251 return (__m128i)__builtin_ia32_selectd_128(
252 (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
253}
254
255static __inline__ __m128i __DEFAULT_FN_ATTRS128
256_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
257 return (__m128i)__builtin_ia32_selectd_128(
258 (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
259 (__v4si)_mm_setzero_si128());
260}
261
262static __inline__ __m256i __DEFAULT_FN_ATTRS256
263_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
264 return (__m256i)__builtin_ia32_selectd_256(
265 (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
266}
267
268static __inline__ __m256i __DEFAULT_FN_ATTRS256
269_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
270 return (__m256i)__builtin_ia32_selectd_256(
271 (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
272 (__v8si)_mm256_setzero_si256());
273}
274
275static __inline__ __m128i __DEFAULT_FN_ATTRS128
276_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
277 return (__m128i)__builtin_ia32_selectd_128(
278 (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
279}
280
281static __inline__ __m128i __DEFAULT_FN_ATTRS128
282_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
283 return (__m128i)__builtin_ia32_selectd_128(
284 (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
285 (__v4si)_mm_setzero_si128());
286}
287
288static __inline__ __m256i __DEFAULT_FN_ATTRS256
289_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
290 return (__m256i)__builtin_ia32_selectd_256(
291 (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
292}
293
294static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
295 __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
296 return (__m256i)__builtin_ia32_selectd_256(
297 (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
298 (__v8si)_mm256_setzero_si256());
299}
300
301static __inline__ __m128i __DEFAULT_FN_ATTRS128
302_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
303 return (__m128i)__builtin_ia32_selectd_128(
304 (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
305}
306
307static __inline__ __m128i __DEFAULT_FN_ATTRS128
308_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
309 return (__m128i)__builtin_ia32_selectd_128(
310 (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
311 (__v4si)_mm_setzero_si128());
312}
313
314static __inline__ __m256i __DEFAULT_FN_ATTRS256
315_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
316 return (__m256i)__builtin_ia32_selectd_256(
317 (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
318}
319
320static __inline__ __m256i __DEFAULT_FN_ATTRS256
321_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
322 return (__m256i)__builtin_ia32_selectd_256(
323 (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
324 (__v8si)_mm256_setzero_si256());
325}
326
327static __inline__ __m128i __DEFAULT_FN_ATTRS128
328_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
329 return (__m128i)__builtin_ia32_selectd_128(
330 (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
331}
332
333static __inline__ __m128i __DEFAULT_FN_ATTRS128
334_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
335 return (__m128i)__builtin_ia32_selectd_128(
336 (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
337 (__v4si)_mm_setzero_si128());
338}
339
340static __inline__ __m256i __DEFAULT_FN_ATTRS256
341_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
342 return (__m256i)__builtin_ia32_selectd_256(
343 (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
344}
345
346static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
347 __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
348 return (__m256i)__builtin_ia32_selectd_256(
349 (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
350 (__v8si)_mm256_setzero_si256());
351}
352
353static __inline__ __m128i __DEFAULT_FN_ATTRS128
354_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
355 return (__m128i)__builtin_ia32_selectd_128(
356 (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
357}
358
359static __inline__ __m128i __DEFAULT_FN_ATTRS128
360_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
361 return (__m128i)__builtin_ia32_selectd_128(
362 (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
363 (__v4si)_mm_setzero_si128());
364}
365
366static __inline__ __m256i __DEFAULT_FN_ATTRS256
367_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
368 return (__m256i)__builtin_ia32_selectd_256(
369 (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
370}
371
372static __inline__ __m256i __DEFAULT_FN_ATTRS256
373_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
374 return (__m256i)__builtin_ia32_selectd_256(
375 (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
376 (__v8si)_mm256_setzero_si256());
377}
378
379static __inline__ __m128i __DEFAULT_FN_ATTRS128
380_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
381 return (__m128i)__builtin_ia32_selectd_128(
382 (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
383}
384
385static __inline__ __m128i __DEFAULT_FN_ATTRS128
386_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
387 return (__m128i)__builtin_ia32_selectd_128(
388 (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
389 (__v4si)_mm_setzero_si128());
390}
391
392static __inline__ __m256i __DEFAULT_FN_ATTRS256
393_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
394 return (__m256i)__builtin_ia32_selectd_256(
395 (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
396}
397
398static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
399 __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
400 return (__m256i)__builtin_ia32_selectd_256(
401 (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
402 (__v8si)_mm256_setzero_si256());
403}
404
405/* YMM Rounding */
/* _mm256_add_round_pd family (__v4df operands, __m256d result).
 *   plain : calls __builtin_ia32_vaddpd256_round on A and B, forwarding R
 *           as an int.
 *   mask  : passes the plain result and W to __builtin_ia32_selectpd_256
 *           under mask U.
 *   maskz : same select, with an all-zero vector in place of W.
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_add_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vaddpd256_round( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_add_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_add_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_add_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_add_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))
419
/* _mm256_add_round_ph family (__v16hf operands, __m256h result).
 *   plain : calls __builtin_ia32_vaddph256_round on A and B, forwarding R
 *           as an int.
 *   mask  : passes the plain result and W to __builtin_ia32_selectph_256
 *           under the 16-bit mask U.
 *   maskz : same select, with an all-zero vector in place of W.
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_add_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vaddph256_round( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_add_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_add_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))
433
/* _mm256_add_round_ps family (__v8sf operands, __m256 result).
 *   plain : calls __builtin_ia32_vaddps256_round on A and B, forwarding R
 *           as an int.
 *   mask  : passes the plain result and W to __builtin_ia32_selectps_256
 *           under mask U.
 *   maskz : same select, with an all-zero vector in place of W.
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_add_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vaddps256_round( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_add_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_add_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
447
/* _mm256_cmp_round_pd_mask family: wrappers around
 * __builtin_ia32_vcmppd256_round_mask returning an __mmask8.  The builtin
 * receives A and B as __v4df, P and R as ints, and a mask operand that is
 * all-ones for the plain form and U for the mask form.
 * P is presumably the comparison predicate and R the rounding/exception
 * control - confirm against the builtin's definition. */
#define _mm256_cmp_round_pd_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \
      (int)(P), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \
      (int)(P), (__mmask8)(U), (int)(R)))
457
/* _mm256_cmp_round_ph_mask family: wrappers around
 * __builtin_ia32_vcmpph256_round_mask returning an __mmask16.  The builtin
 * receives A and B as __v16hf, P and R as ints, and a mask operand that is
 * all-ones for the plain form and U for the mask form.
 * P is presumably the comparison predicate and R the rounding/exception
 * control - confirm against the builtin's definition. */
#define _mm256_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
      (int)(P), (__mmask16)(-1), (int)(R)))

#define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
      (int)(P), (__mmask16)(U), (int)(R)))
467
/* _mm256_cmp_round_ps_mask family: wrappers around
 * __builtin_ia32_vcmpps256_round_mask returning an __mmask8.  The builtin
 * receives A and B as __v8sf, P and R as ints, and a mask operand that is
 * all-ones for the plain form and U for the mask form.
 * P is presumably the comparison predicate and R the rounding/exception
 * control - confirm against the builtin's definition. */
#define _mm256_cmp_round_ps_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), \
      (int)(P), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), \
      (int)(P), (__mmask8)(U), (int)(R)))
477
/* _mm256_cvt_roundepi32_ph family: wrappers around
 * __builtin_ia32_vcvtdq2ph256_round_mask (__v8si source, __m128h result).
 * Builtin operands: source, a second __v8hf vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundepi32_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
      (__v8si)(__m256i)(A), (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi32_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
      (__v8si)(__m256i)(A), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \
      (__v8si)(__m256i)(A), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
489
/* _mm256_cvt_roundepi32_ps family: wrappers around
 * __builtin_ia32_vcvtdq2ps256_round_mask (__v8si source, __m256 result).
 * Builtin operands: source, a second __v8sf vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundepi32_ps(A, R) \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
      (__v8si)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
      (__v8si)(__m256i)(A), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \
      (__v8si)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
503
/* _mm256_cvt_roundpd_epi32 family: wrappers around
 * __builtin_ia32_vcvtpd2dq256_round_mask (__v4df source, __m128i result).
 * Builtin operands: source, a second __v4si vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundpd_epi32(A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_epi32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), \
      (__mmask8)(U), (int)(R)))
517
/* _mm256_cvt_roundpd_ph family: wrappers around
 * __builtin_ia32_vcvtpd2ph256_round_mask (__v4df source, __m128h result).
 * Builtin operands: source, a second __v8hf vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
      (__v4df)(__m256d)(A), (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
      (__v4df)(__m256d)(A), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \
      (__v4df)(__m256d)(A), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
529
/* _mm256_cvt_roundpd_ps family: wrappers around
 * __builtin_ia32_vcvtpd2ps256_round_mask (__v4df source, __m128 result).
 * Builtin operands: source, a second __v4sf vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundpd_ps(A, R) \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
      (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_ps(W, U, A, R) \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
      (__v4df)(__m256d)(A), (__v4sf)(__m128)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_ps(U, A, R) \
  ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \
      (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
542
/* _mm256_cvt_roundpd_epi64 family: wrappers around
 * __builtin_ia32_vcvtpd2qq256_round_mask (__v4df source, __m256i result).
 * Builtin operands: source, a second __v4di vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundpd_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
556
/* _mm256_cvt_roundpd_epu32 family: wrappers around
 * __builtin_ia32_vcvtpd2udq256_round_mask (__v4df source, __m128i result).
 * Builtin operands: source, a second __v4su vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundpd_epu32(A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_epu32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), \
      (__mmask8)(U), (int)(R)))
570
/* _mm256_cvt_roundpd_epu64 family: wrappers around
 * __builtin_ia32_vcvtpd2uqq256_round_mask (__v4df source, __m256i result).
 * Builtin operands: source, a second __v4du vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundpd_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundpd_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundpd_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
584
/* _mm256_cvt_roundph_epi32 family: wrappers around
 * __builtin_ia32_vcvtph2dq256_round_mask (__v8hf source, __m256i result).
 * Builtin operands: source, a second __v8si vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundph_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8si)_mm256_undefined_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8si)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
597
/* _mm256_cvt_roundph_pd family: wrappers around
 * __builtin_ia32_vcvtph2pd256_round_mask (__v8hf source, __m256d result).
 * Builtin operands: source, a second __v4df vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundph_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4df)_mm256_undefined_pd(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4df)(__m256d)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
609
/* _mm256_cvtx_roundph_ps family: wrappers around
 * __builtin_ia32_vcvtph2psx256_round_mask (__v8hf source, __m256 result).
 * Builtin operands: source, a second __v8sf vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvtx_roundph_ps(A, R) \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvtx_roundph_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtx_roundph_ps(U, A, R) \
  ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
621
/* _mm256_cvt_roundph_epi64 family: wrappers around
 * __builtin_ia32_vcvtph2qq256_round_mask (__v8hf source, __m256i result).
 * Builtin operands: source, a second __v4di vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundph_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4di)_mm256_undefined_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4di)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4di)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
634
/* _mm256_cvt_roundph_epu32 family: wrappers around
 * __builtin_ia32_vcvtph2udq256_round_mask (__v8hf source, __m256i result).
 * Builtin operands: source, a second __v8su vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundph_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8su)_mm256_undefined_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8su)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
647
/* _mm256_cvt_roundph_epu64 family: wrappers around
 * __builtin_ia32_vcvtph2uqq256_round_mask (__v8hf source, __m256i result).
 * Builtin operands: source, a second __v4du vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundph_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4du)_mm256_undefined_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4du)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \
      (__v8hf)(__m128h)(A), (__v4du)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
660
/* _mm256_cvt_roundph_epu16 family: wrappers around
 * __builtin_ia32_vcvtph2uw256_round_mask (__v16hf source, __m256i result).
 * Builtin operands: source, a second __v16hu vector (presumably the value
 * kept for masked-off lanes), a 16-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundph_epu16(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_undefined_si256(), \
      (__mmask16)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_epu16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(__m256i)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epu16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), \
      (__mmask16)(U), (int)(R)))
674
/* _mm256_cvt_roundph_epi16 family: wrappers around
 * __builtin_ia32_vcvtph2w256_round_mask (__v16hf source, __m256i result).
 * Builtin operands: source, a second __v16hi vector (presumably the value
 * kept for masked-off lanes), a 16-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundph_epi16(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hi)_mm256_undefined_si256(), \
      (__mmask16)(-1), (int)(R)))

#define _mm256_mask_cvt_roundph_epi16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hi)(__m256i)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundph_epi16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hi)_mm256_setzero_si256(), \
      (__mmask16)(U), (int)(R)))
688
/* _mm256_cvt_roundps_epi32 family: wrappers around
 * __builtin_ia32_vcvtps2dq256_round_mask (__v8sf source, __m256i result).
 * Builtin operands: source, a second __v8si vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundps_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundps_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
702
/* _mm256_cvt_roundps_pd family: wrappers around
 * __builtin_ia32_vcvtps2pd256_round_mask (__v4sf source, __m256d result).
 * Builtin operands: source, a second __v4df vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundps_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
      (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundps_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
      (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \
      (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), (int)(R)))
716
/* Wrapper around __builtin_ia32_vcvtps2ph256_mask: passes A as __v8sf, I as
 * an int, an undefined __v8hi vector and an all-ones 8-bit mask, and returns
 * the result as __m128i.  Unlike the neighbouring macros this one does not
 * use a *_round_mask builtin. */
#define _mm256_cvt_roundps_ph(A, I) \
  ((__m128i)__builtin_ia32_vcvtps2ph256_mask( \
      (__v8sf)(__m256)(A), (int)(I), \
      (__v8hi)_mm_undefined_si128(), (__mmask8)(-1)))
721
/* FIXME: We may define these this way in the future.
723#define _mm256_cvt_roundps_ph(A, I) \
724 ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
725 (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_undefined_si128(), \
726 (__mmask8)-1))
727#define _mm256_mask_cvt_roundps_ph(U, W, A, I) \
728 ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
729 (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(U), (__mmask8)(W)))
730#define _mm256_maskz_cvt_roundps_ph(W, A, I) \
731 ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \
732 (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_setzero_si128(), \
733 (__mmask8)(W))) */
734
/* _mm256_cvtx_roundps_ph family: wrappers around
 * __builtin_ia32_vcvtps2phx256_round_mask (__v8sf source, __m128h result).
 * Builtin operands: source, a second __v8hf vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : undefined vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvtx_roundps_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
      (__v8sf)(__m256)(A), (__v8hf)_mm_undefined_ph(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvtx_roundps_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
      (__v8sf)(__m256)(A), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtx_roundps_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \
      (__v8sf)(__m256)(A), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))
746
/* _mm256_cvt_roundps_epi64 family: wrappers around
 * __builtin_ia32_vcvtps2qq256_round_mask (__v4sf source, __m256i result).
 * Builtin operands: source, a second __v4di vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundps_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundps_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
760
/* _mm256_cvt_roundps_epu32 family: wrappers around
 * __builtin_ia32_vcvtps2udq256_round_mask (__v8sf source, __m256i result).
 * Builtin operands: source, a second __v8su vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundps_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundps_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
774
/* _mm256_cvt_roundps_epu64 family: wrappers around
 * __builtin_ia32_vcvtps2uqq256_round_mask (__v4sf source, __m256i result).
 * Builtin operands: source, a second __v4du vector (presumably the value
 * kept for masked-off lanes), an 8-bit mask, and R as an int.
 *   plain : all-zero vector, all-ones mask
 *   mask  : W, U
 *   maskz : all-zero vector, U
 * R is presumably a rounding-control immediate - confirm accepted values. */
#define _mm256_cvt_roundps_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), \
      (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundps_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), \
      (__mmask8)(U), (int)(R)))
788
/* Wrappers around __builtin_ia32_vcvtqq2pd256_round_mask: A is cast to
 * __v4di (via __m256i), R to int, and the result to __m256d.
 *   plain: _mm256_setzero_pd() as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_pd() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepi64_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
      (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundepi64_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
      (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \
      (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
      (int)(R)))
802
/* Wrappers around __builtin_ia32_vcvtqq2ph256_round_mask: A is cast to
 * __v4di, R to int, and the result to __m128h.
 *   plain: _mm_undefined_ph() as second operand, (__mmask8)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm_setzero_ph() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepi64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
      (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepi64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
      (__v4di)(A), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \
      (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
814
/* Wrappers around __builtin_ia32_vcvtqq2ps256_round_mask: A is cast to
 * __v4di (via __m256i), R to int, and the result to __m128.
 *   plain: _mm_setzero_ps() as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: _mm_setzero_ps() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepi64_ps(A, R) \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
      (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepi64_ps(W, U, A, R) \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
      (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi64_ps(U, A, R) \
  ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \
      (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)(U), \
      (int)(R)))
827
/* Wrappers around __builtin_ia32_vcvttpd2dq256_round_mask: A is cast to
 * __v4df (via __m256d), R to int, and the result to __m128i.
 *   plain: _mm_setzero_si128() as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: _mm_setzero_si128() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundpd_epi32(A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epi32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \
      (int)(R)))
841
/* Wrappers around __builtin_ia32_vcvttpd2qq256_round_mask: A is cast to
 * __v4df (via __m256d), R to int, and the result to __m256i.
 *   plain: zero vector as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: zero vector as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundpd_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
855
/* Wrappers around __builtin_ia32_vcvttpd2udq256_round_mask: A is cast to
 * __v4df (via __m256d), R to int, and the result to __m128i.
 *   plain: _mm_setzero_si128() as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: _mm_setzero_si128() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundpd_epu32(A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epu32(U, A, R) \
  ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \
      (int)(R)))
869
/* Wrappers around __builtin_ia32_vcvttpd2uqq256_round_mask: A is cast to
 * __v4df (via __m256d), R to int, and the result to __m256i.
 *   plain: zero vector as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: zero vector as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundpd_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundpd_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \
      (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
883
/* Wrappers around __builtin_ia32_vcvttph2dq256_round_mask: A is cast to
 * __v8hf, R to int, and the result to __m256i.
 *   plain: _mm256_undefined_si256() as second operand, (__mmask8)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_si256() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundph_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
      (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
      (__v8hf)(A), (__v8si)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \
      (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
896
/* Wrappers around __builtin_ia32_vcvttph2qq256_round_mask: A is cast to
 * __v8hf, R to int, and the result to __m256i.
 *   plain: _mm256_undefined_si256() as second operand, (__mmask8)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_si256() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundph_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
      (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
      (__v8hf)(A), (__v4di)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \
      (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
909
/* Wrappers around __builtin_ia32_vcvttph2udq256_round_mask: A is cast to
 * __v8hf, R to int, and the result to __m256i.
 *   plain: _mm256_undefined_si256() as second operand, (__mmask8)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_si256() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundph_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
      (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
      (__v8hf)(A), (__v8su)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \
      (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
922
/* Wrappers around __builtin_ia32_vcvttph2uqq256_round_mask: A is cast to
 * __v8hf, R to int, and the result to __m256i.
 *   plain: _mm256_undefined_si256() as second operand, (__mmask8)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_si256() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundph_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
      (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
      (__v8hf)(A), (__v4du)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \
      (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R)))
935
/* Wrappers around __builtin_ia32_vcvttph2uw256_round_mask: A is cast to
 * __v16hf, R to int, and the result to __m256i. The mask is 16 bits wide.
 *   plain: _mm256_undefined_si256() as second operand, (__mmask16)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_si256() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundph_epu16(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epu16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epu16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \
      (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (int)(R)))
949
/* Wrappers around __builtin_ia32_vcvttph2w256_round_mask: A is cast to
 * __v16hf, R to int, and the result to __m256i. The mask is 16 bits wide.
 *   plain: _mm256_undefined_si256() as second operand, (__mmask16)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_si256() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundph_epi16(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
      (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvtt_roundph_epi16(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
      (__v16hf)(A), (__v16hi)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundph_epi16(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \
      (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \
      (int)(R)))
963
/* Wrappers around __builtin_ia32_vcvttps2dq256_round_mask: A is cast to
 * __v8sf (via __m256), R to int, and the result to __m256i.
 *   plain: zero vector as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: zero vector as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundps_epi32(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
977
/* Wrappers around __builtin_ia32_vcvttps2qq256_round_mask: A is cast to
 * __v4sf (via __m128), R to int, and the result to __m256i.
 *   plain: zero vector as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: zero vector as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundps_epi64(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epi64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epi64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
991
/* Wrappers around __builtin_ia32_vcvttps2udq256_round_mask: A is cast to
 * __v8sf (via __m256), R to int, and the result to __m256i.
 *   plain: zero vector as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: zero vector as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundps_epu32(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
1005
/* Wrappers around __builtin_ia32_vcvttps2uqq256_round_mask: A is cast to
 * __v4sf (via __m128), R to int, and the result to __m256i.
 *   plain: zero vector as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: zero vector as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvtt_roundps_epu64(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvtt_roundps_epu64(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtt_roundps_epu64(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \
      (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \
      (int)(R)))
1019
/* Wrappers around __builtin_ia32_vcvtudq2ph256_round_mask: A is cast to
 * __v8su, R to int, and the result to __m128h.
 *   plain: _mm_undefined_ph() as second operand, (__mmask8)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm_setzero_ph() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepu32_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
      (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepu32_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
      (__v8su)(A), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu32_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \
      (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1031
/* Wrappers around __builtin_ia32_vcvtudq2ps256_round_mask: A is cast to
 * __v8su (via __m256i), R to int, and the result to __m256.
 *   plain: _mm256_setzero_ps() as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_ps() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepu32_ps(A, R) \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
      (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
      (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \
      (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), \
      (int)(R)))
1045
/* Wrappers around __builtin_ia32_vcvtuqq2pd256_round_mask: A is cast to
 * __v4du (via __m256i), R to int, and the result to __m256d.
 *   plain: _mm256_setzero_pd() as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_pd() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepu64_pd(A, R) \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
      (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_cvt_roundepu64_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
      (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \
      (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
      (int)(R)))
1059
/* Wrappers around __builtin_ia32_vcvtuqq2ph256_round_mask: A is cast to
 * __v4du, R to int, and the result to __m128h.
 *   plain: _mm_undefined_ph() as second operand, (__mmask8)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm_setzero_ph() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepu64_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
      (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvt_roundepu64_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
      (__v4du)(A), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \
      (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1071
/* Wrappers around __builtin_ia32_vcvtuqq2ps256_round_mask: A is cast to
 * __v4du (via __m256i), R to int, and the result to __m128.
 *   plain: _mm_setzero_ps() as second operand, (__mmask8)-1
 *   mask : W as second operand, U
 *   maskz: _mm_setzero_ps() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepu64_ps(A, R) \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
      (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cvt_roundepu64_ps(W, U, A, R) \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
      (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu64_ps(U, A, R) \
  ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \
      (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)(U), \
      (int)(R)))
1084
/* Wrappers around __builtin_ia32_vcvtuw2ph256_round_mask: A is cast to
 * __v16hu, R to int, and the result to __m256h. The mask is 16 bits wide.
 *   plain: _mm256_undefined_ph() as second operand, (__mmask16)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_ph() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepu16_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
      (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundepu16_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
      (__v16hu)(A), (__v16hf)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepu16_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \
      (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1097
/* Wrappers around __builtin_ia32_vcvtw2ph256_round_mask: A is cast to
 * __v16hi, R to int, and the result to __m256h. The mask is 16 bits wide.
 *   plain: _mm256_undefined_ph() as second operand, (__mmask16)(-1)
 *   mask : W as second operand, U
 *   maskz: _mm256_setzero_ph() as second operand, U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cvt_roundepi16_ph(A, R) \
  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
      (__v16hi)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \
      (int)(R)))

#define _mm256_mask_cvt_roundepi16_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
      (__v16hi)(A), (__v16hf)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_cvt_roundepi16_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \
      (__v16hi)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1110
/* _mm256_div_round_pd forwards A and B (as __v4df) and R (as int) to
 * __builtin_ia32_vdivpd256_round and casts the result to __m256d.
 * The masked forms feed that result into __builtin_ia32_selectpd_256 together
 * with mask U and a second source: W for mask, _mm256_setzero_pd() for maskz.
 * NOTE(review): R is presumably a rounding-control immediate; confirm against
 * the builtin. */
#define _mm256_div_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vdivpd256_round((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_div_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_div_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))
1124
/* _mm256_div_round_ph forwards A and B (as __v16hf) and R (as int) to
 * __builtin_ia32_vdivph256_round and casts the result to __m256h.
 * The masked forms feed that result into __builtin_ia32_selectph_256 together
 * with the 16-bit mask U and a second source: W for mask,
 * _mm256_setzero_ph() for maskz.
 * NOTE(review): R is presumably a rounding-control immediate; confirm against
 * the builtin. */
#define _mm256_div_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vdivph256_round((__v16hf)(__m256h)(A), \
                                           (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_div_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_div_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))
1138
/* _mm256_div_round_ps forwards A and B (as __v8sf) and R (as int) to
 * __builtin_ia32_vdivps256_round and casts the result to __m256.
 * The masked forms feed that result into __builtin_ia32_selectps_256 together
 * with mask U and a second source: W for mask, _mm256_setzero_ps() for maskz.
 * NOTE(review): R is presumably a rounding-control immediate; confirm against
 * the builtin. */
#define _mm256_div_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vdivps256_round((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_div_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_div_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
1152
/* fcmadd_round_pch family. A, B and C are __m256h values reinterpreted as
 * __v8sf, R is cast to int, and the result is cast back to __m256h.
 *   plain: __builtin_ia32_vfcmaddcph256_round_mask3 with (__mmask8)-1
 *   mask : __builtin_ia32_vfcmaddcph256_round_mask  with U
 *   mask3: __builtin_ia32_vfcmaddcph256_round_mask3 with U
 *   maskz: __builtin_ia32_vfcmaddcph256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fcmadd_round_pch(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fcmadd_round_pch(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))
1172
/* cmul_round_pch family: wrappers around
 * __builtin_ia32_vfcmulcph256_round_mask. A and B are __m256h values
 * reinterpreted as __v8sf, R is cast to int, result is cast to __m256h.
 *   plain: third operand _mm256_undefined_ph(), (__mmask8)-1
 *   mask : third operand W, mask U
 *   maskz: third operand _mm256_setzero_ph(), mask U
 * NOTE(review): operand roles are inferred from names; confirm against the
 * builtin. */
#define _mm256_cmul_round_pch(A, B, R) \
  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_cmul_round_pch(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cmul_round_pch(U, A, B, R) \
  ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))
1187
/* fixupimm_round_pd family. A and B are cast to __v4df, C to __v4di, imm and
 * R to int; the result is cast to __m256d.
 *   plain: __builtin_ia32_vfixupimmpd256_round_mask  with (__mmask8)-1
 *   mask : __builtin_ia32_vfixupimmpd256_round_mask  with U
 *   maskz: __builtin_ia32_vfixupimmpd256_round_maskz with U
 * NOTE(review): the meaning of imm and R is defined by the builtin, not
 * visible here. */
#define _mm256_fixupimm_round_pd(A, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
      (int)(imm), (__mmask8)-1, (int)(R)))

#define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))
1202
/* fixupimm_round_ps family. A and B are cast to __v8sf, C to __v8si, imm and
 * R to int; the result is cast to __m256.
 *   plain: __builtin_ia32_vfixupimmps256_round_mask  with (__mmask8)-1
 *   mask : __builtin_ia32_vfixupimmps256_round_mask  with U
 *   maskz: __builtin_ia32_vfixupimmps256_round_maskz with U
 * NOTE(review): the meaning of imm and R is defined by the builtin, not
 * visible here. */
#define _mm256_fixupimm_round_ps(A, B, C, imm, R) \
  ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
      (int)(imm), (__mmask8)-1, (int)(R)))

#define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
  ((__m256)__builtin_ia32_vfixupimmps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \
      (int)(imm), (__mmask8)(U), (int)(R)))
1217
/* fmadd_round_pd family. A, B and C are cast to __v4df, R to int; the result
 * is cast to __m256d.
 *   plain: __builtin_ia32_vfmaddpd256_round_mask  with (__mmask8)-1
 *   mask : __builtin_ia32_vfmaddpd256_round_mask  with U
 *   mask3: __builtin_ia32_vfmaddpd256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddpd256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fmadd_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))
1237
/* fmsub_round_pd family: same builtins as the fmadd_round_pd macros
 * (__builtin_ia32_vfmaddpd256_round_mask / _maskz), but C is negated before
 * the call. Operands are cast to __v4df, R to int, result to __m256d.
 *   plain: round_mask with (__mmask8)-1
 *   mask : round_mask with U
 *   maskz: round_maskz with U */
#define _mm256_fmsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))
1252
/* fnmadd_round_pd family: the fmadd builtins are called with A negated.
 * Operands are cast to __v4df, R to int, result to __m256d.
 *   plain: __builtin_ia32_vfmaddpd256_round_mask  with (__mmask8)-1
 *   mask3: __builtin_ia32_vfmaddpd256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddpd256_round_maskz with U */
#define _mm256_fnmadd_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))
1267
/* fnmsub_round_pd (plain and maskz): the fmadd builtins are called with both
 * A and C negated. Operands are cast to __v4df, R to int, result to __m256d.
 *   plain: __builtin_ia32_vfmaddpd256_round_mask  with (__mmask8)-1
 *   maskz: __builtin_ia32_vfmaddpd256_round_maskz with U */
#define _mm256_fnmsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))
1277
/* fmadd_round_ph family. A, B and C are cast to __v16hf, R to int; the result
 * is cast to __m256h. The mask is 16 bits wide.
 *   plain: __builtin_ia32_vfmaddph256_round_mask  with (__mmask16)-1
 *   mask : __builtin_ia32_vfmaddph256_round_mask  with U
 *   mask3: __builtin_ia32_vfmaddph256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddph256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fmadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))
1297
/* fmsub_round_ph family: the fmadd builtins
 * (__builtin_ia32_vfmaddph256_round_mask / _maskz) are called with C negated.
 * Operands are cast to __v16hf, R to int, result to __m256h.
 *   plain: round_mask with (__mmask16)-1
 *   mask : round_mask with U
 *   maskz: round_maskz with U */
#define _mm256_fmsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))
1312
/* fnmadd_round_ph family: the fmadd builtins are called with one multiplicand
 * negated. The plain form negates B; the mask3 and maskz forms negate A.
 * Operands are cast to __v16hf, R to int, result to __m256h.
 *   plain: __builtin_ia32_vfmaddph256_round_mask  with (__mmask16)-1
 *   mask3: __builtin_ia32_vfmaddph256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddph256_round_maskz with U */
#define _mm256_fnmadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))
1327
/* fnmsub_round_ph (plain and maskz): the fmadd builtins are called with one
 * multiplicand and C negated. The plain form negates B; maskz negates A.
 * Operands are cast to __v16hf, R to int, result to __m256h.
 *   plain: __builtin_ia32_vfmaddph256_round_mask  with (__mmask16)-1
 *   maskz: __builtin_ia32_vfmaddph256_round_maskz with U */
#define _mm256_fnmsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))
1337
/* fmadd_round_ps family. A, B and C are cast to __v8sf, R to int; the result
 * is cast to __m256.
 *   plain: __builtin_ia32_vfmaddps256_round_mask  with (__mmask8)-1
 *   mask : __builtin_ia32_vfmaddps256_round_mask  with U
 *   mask3: __builtin_ia32_vfmaddps256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddps256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fmadd_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
1357
/* fmsub_round_ps family: the fmadd builtins
 * (__builtin_ia32_vfmaddps256_round_mask / _maskz) are called with C negated.
 * Operands are cast to __v8sf, R to int, result to __m256.
 *   plain: round_mask with (__mmask8)-1
 *   mask : round_mask with U
 *   maskz: round_maskz with U */
#define _mm256_fmsub_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
1372
/* fnmadd_round_ps family: the fmadd builtins are called with one multiplicand
 * negated. The plain form negates B; the mask3 and maskz forms negate A.
 * Operands are cast to __v8sf, R to int, result to __m256.
 *   plain: __builtin_ia32_vfmaddps256_round_mask  with (__mmask8)-1
 *   mask3: __builtin_ia32_vfmaddps256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddps256_round_maskz with U */
#define _mm256_fnmadd_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
1387
/* fnmsub_round_ps (plain and maskz): the fmadd builtins are called with one
 * multiplicand and C negated. The plain form negates B; maskz negates A.
 * Operands are cast to __v8sf, R to int, result to __m256.
 *   plain: __builtin_ia32_vfmaddps256_round_mask  with (__mmask8)-1
 *   maskz: __builtin_ia32_vfmaddps256_round_maskz with U */
#define _mm256_fnmsub_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
1397
/* fmadd_round_pch family. A, B and C are __m256h values reinterpreted as
 * __v8sf, R is cast to int, and the result is cast back to __m256h.
 *   plain: __builtin_ia32_vfmaddcph256_round_mask3 with (__mmask8)-1
 *   mask : __builtin_ia32_vfmaddcph256_round_mask  with U
 *   mask3: __builtin_ia32_vfmaddcph256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddcph256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fmadd_round_pch(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))
1417
/* fmaddsub_round_pd family. A, B and C are cast to __v4df, R to int; the
 * result is cast to __m256d.
 *   plain: __builtin_ia32_vfmaddsubpd256_round_mask  with (__mmask8)-1
 *   mask : __builtin_ia32_vfmaddsubpd256_round_mask  with U
 *   mask3: __builtin_ia32_vfmaddsubpd256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddsubpd256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fmaddsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))
1437
/* fmsubadd_round_pd family: the fmaddsub builtins
 * (__builtin_ia32_vfmaddsubpd256_round_mask / _maskz) are called with C
 * negated. Operands are cast to __v4df, R to int, result to __m256d.
 *   plain: round_mask with (__mmask8)-1
 *   mask : round_mask with U
 *   maskz: round_maskz with U */
#define _mm256_fmsubadd_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))
1452
/* fmaddsub_round_ph family. A, B and C are cast to __v16hf, R to int; the
 * result is cast to __m256h. The mask is 16 bits wide.
 *   plain: __builtin_ia32_vfmaddsubph256_round_mask  with (__mmask16)-1
 *   mask : __builtin_ia32_vfmaddsubph256_round_mask  with U
 *   mask3: __builtin_ia32_vfmaddsubph256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddsubph256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fmaddsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))
1472
/* fmsubadd_round_ph family: the fmaddsub builtins
 * (__builtin_ia32_vfmaddsubph256_round_mask / _maskz) are called with C
 * negated. Operands are cast to __v16hf, R to int, result to __m256h.
 *   plain: round_mask with (__mmask16)-1
 *   mask : round_mask with U
 *   maskz: round_maskz with U */
#define _mm256_fmsubadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))
1487
/* fmaddsub_round_ps family. A, B and C are cast to __v8sf, R to int; the
 * result is cast to __m256.
 *   plain: __builtin_ia32_vfmaddsubps256_round_mask  with (__mmask8)-1
 *   mask : __builtin_ia32_vfmaddsubps256_round_mask  with U
 *   mask3: __builtin_ia32_vfmaddsubps256_round_mask3 with U
 *   maskz: __builtin_ia32_vfmaddsubps256_round_maskz with U
 * NOTE(review): which operand each builtin keeps for masked-off lanes is not
 * visible here; confirm against the builtin definitions. */
#define _mm256_fmaddsub_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
1507
/* fmsubadd_round_ps family: the fmaddsub builtins
 * (__builtin_ia32_vfmaddsubps256_round_mask / _maskz) are called with C
 * negated. Operands are cast to __v8sf, R to int, result to __m256.
 *   plain: round_mask with (__mmask8)-1
 *   mask : round_mask with U
 *   maskz: round_maskz with U */
#define _mm256_fmsubadd_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
/* Double-precision masked fmsub / fmsubadd / fnmadd / fnmsub forms with an
 * explicit R operand. U is cast to __mmask8 and R to int.
 *
 *   - mask3_fmsub:    vfmsubpd256 round_mask3 builtin, operands unchanged
 *   - mask3_fmsubadd: vfmsubaddpd256 round_mask3 builtin, operands unchanged
 *   - mask_fnmadd:    vfmaddpd256 round_mask builtin with B negated
 *   - mask_fnmsub:    vfmaddpd256 round_mask builtin with B and C negated
 *   - mask3_fnmsub:   vfmsubpd256 round_mask3 builtin with A negated */
#define _mm256_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (__v4df)(__m256d)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (__v4df)(__m256d)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      -(__v4df)(__m256d)(B), \
      (__v4df)(__m256d)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      -(__v4df)(__m256d)(B), \
      -(__v4df)(__m256d)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
      -(__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (__v4df)(__m256d)(C), \
      (__mmask8)(U), \
      (int)(R)))
1546
/* Half-precision masked fmsub / fmsubadd / fnmadd / fnmsub forms with an
 * explicit R operand. U is cast to __mmask16 and R to int.
 *
 *   - mask3_fmsub:    vfmsubph256 round_mask3 builtin, operands unchanged
 *   - mask3_fmsubadd: vfmsubaddph256 round_mask3 builtin, operands unchanged
 *   - mask_fnmadd:    vfmaddph256 round_mask builtin with B negated
 *   - mask_fnmsub:    vfmaddph256 round_mask builtin with B and C negated
 *   - mask3_fnmsub:   vfmsubph256 round_mask3 builtin with A negated */
#define _mm256_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (__v16hf)(__m256h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (__v16hf)(__m256h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      -(__v16hf)(__m256h)(B), \
      (__v16hf)(__m256h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      -(__v16hf)(__m256h)(B), \
      -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
      -(__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (__v16hf)(__m256h)(C), \
      (__mmask16)(U), \
      (int)(R)))
1571
/* Single-precision masked fmsub / fmsubadd / fnmadd / fnmsub forms with an
 * explicit R operand. U is cast to __mmask8 and R to int.
 *
 *   - mask3_fmsub:    vfmsubps256 round_mask3 builtin, operands unchanged
 *   - mask3_fmsubadd: vfmsubaddps256 round_mask3 builtin, operands unchanged
 *   - mask_fnmadd:    vfmaddps256 round_mask builtin with B negated
 *   - mask_fnmsub:    vfmaddps256 round_mask builtin with B and C negated
 *   - mask3_fnmsub:   vfmsubps256 round_mask3 builtin with A negated */
#define _mm256_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (__v8sf)(__m256)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (__v8sf)(__m256)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), \
      -(__v8sf)(__m256)(B), \
      (__v8sf)(__m256)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), \
      -(__v8sf)(__m256)(B), \
      -(__v8sf)(__m256)(C), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
      -(__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (__v8sf)(__m256)(C), \
      (__mmask8)(U), \
      (int)(R)))
1596
/* _pch multiply with an explicit R operand, built on the vfmulcph256
 * round_mask builtin.
 *
 * The __m256h operands are reinterpreted as __v8sf before the call and the
 * mask is a __mmask8. The third builtin operand is an undefined vector
 * (unmasked form, all-ones mask), W (_mask_ form) or a zero vector
 * (_maskz_ form). R is cast to int. */
#define _mm256_mul_round_pch(A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), \
      (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_undefined_ph(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_mul_round_pch(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), \
      (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_mul_round_pch(U, A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), \
      (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_setzero_ph(), \
      (__mmask8)(U), \
      (int)(R)))
1611
/* Double-precision getexp with an explicit R operand.
 *
 * All three forms call the vgetexppd256 round_mask builtin on A. The second
 * builtin operand is an undefined vector (unmasked form, all-ones mask),
 * W (_mask_ form) or a zero vector (_maskz_ form). U is cast to __mmask8
 * and R to int. */
#define _mm256_getexp_round_pd(A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)_mm256_undefined_pd(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_getexp_round_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1625
/* Half-precision getexp with an explicit R operand.
 *
 * All three forms call the vgetexpph256 round_mask builtin on A. The second
 * builtin operand is an undefined vector (unmasked form, all-ones mask),
 * W (_mask_ form) or a zero vector (_maskz_ form). U is cast to __mmask16
 * and R to int. */
#define _mm256_getexp_round_ph(A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_maskz_getexp_round_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1639
/* Single-precision getexp with an explicit R operand.
 *
 * All three forms call the vgetexpps256 round_mask builtin on A. The second
 * builtin operand is an undefined vector (unmasked form, all-ones mask),
 * W (_mask_ form) or a zero vector (_maskz_ form). U is cast to __mmask8
 * and R to int. */
#define _mm256_getexp_round_ps(A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_getexp_round_ps(U, A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1653
/* Double-precision getmant with an explicit R operand.
 *
 * B and C are folded into one int immediate as ((C) << 2) | (B) and passed
 * with A to the vgetmantpd256 round_mask builtin. The third builtin operand
 * is an undefined vector (unmasked form, all-ones mask), W (_mask_ form) or
 * a zero vector (_maskz_ form). U is cast to __mmask8 and R to int. */
#define _mm256_getmant_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (int)(((C) << 2) | (B)), \
      (__v4df)_mm256_undefined_pd(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (int)(((C) << 2) | (B)), \
      (__v4df)(__m256d)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (int)(((C) << 2) | (B)), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1668
/* Half-precision getmant with an explicit R operand.
 *
 * B and C are folded into one int immediate as ((C) << 2) | (B) and passed
 * with A to the vgetmantph256 round_mask builtin. The third builtin operand
 * is an undefined vector (unmasked form, all-ones mask), W (_mask_ form) or
 * a zero vector (_maskz_ form). U is cast to __mmask16 and R to int. */
#define _mm256_getmant_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm256_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v16hf)(__m256h)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (int)(((C) << 2) | (B)), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1683
/* Single-precision getmant with an explicit R operand.
 *
 * B and C are folded into one int immediate as ((C) << 2) | (B) and passed
 * with A to the vgetmantps256 round_mask builtin. The third builtin operand
 * is an undefined vector (unmasked form, all-ones mask), W (_mask_ form) or
 * a zero vector (_maskz_ form). U is cast to __mmask8 and R to int. */
#define _mm256_getmant_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (int)(((C) << 2) | (B)), \
      (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (int)(((C) << 2) | (B)), \
      (__v8sf)(__m256)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (int)(((C) << 2) | (B)), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1698
/* Double-precision max with an explicit R operand.
 *
 * The unmasked form calls the vmaxpd256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectpd_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_max_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vmaxpd256_round( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(R)))

#define _mm256_mask_max_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_max_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_max_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_max_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))
1712
/* Half-precision max with an explicit R operand.
 *
 * The unmasked form calls the vmaxph256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectph_256 together with the
 * __mmask16 mask U and either W or a zero vector. */
#define _mm256_max_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vmaxph256_round( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (int)(R)))

#define _mm256_mask_max_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_max_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))
1726
/* Single-precision max with an explicit R operand.
 *
 * The unmasked form calls the vmaxps256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectps_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_max_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vmaxps256_round( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (int)(R)))

#define _mm256_mask_max_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_max_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
1740
/* Double-precision min with an explicit R operand.
 *
 * The unmasked form calls the vminpd256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectpd_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_min_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vminpd256_round( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(R)))

#define _mm256_mask_min_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_min_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_min_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_min_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))
1754
/* Half-precision min with an explicit R operand.
 *
 * The unmasked form calls the vminph256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectph_256 together with the
 * __mmask16 mask U and either W or a zero vector. */
#define _mm256_min_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vminph256_round( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (int)(R)))

#define _mm256_mask_min_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_min_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))
1768
/* Single-precision min with an explicit R operand.
 *
 * The unmasked form calls the vminps256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectps_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_min_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vminps256_round( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (int)(R)))

#define _mm256_mask_min_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_min_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
1782
/* Double-precision multiply with an explicit R operand.
 *
 * The unmasked form calls the vmulpd256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectpd_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_mul_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vmulpd256_round( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(R)))

#define _mm256_mask_mul_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_mul_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))
1796
/* Half-precision multiply with an explicit R operand.
 *
 * The unmasked form calls the vmulph256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectph_256 together with the
 * __mmask16 mask U and either W or a zero vector. */
#define _mm256_mul_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vmulph256_round( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (int)(R)))

#define _mm256_mask_mul_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_mul_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))
1810
/* Single-precision multiply with an explicit R operand.
 *
 * The unmasked form calls the vmulps256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectps_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_mul_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vmulps256_round( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (int)(R)))

#define _mm256_mask_mul_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_mul_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
1824
/* Double-precision range with an explicit R operand.
 *
 * All three forms call the vrangepd256 round_mask builtin with A, B and the
 * int immediate C. The fourth builtin operand is a zero vector in both the
 * unmasked form (all-ones mask) and the _maskz_ form, and W in the _mask_
 * form. U is cast to __mmask8 and R to int. */
#define _mm256_range_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(C), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_range_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(C), \
      (__v4df)(__m256d)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_range_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(C), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1839
/* Single-precision range with an explicit R operand.
 *
 * All three forms call the vrangeps256 round_mask builtin with A, B and the
 * int immediate C. The fourth builtin operand is a zero vector in both the
 * unmasked form (all-ones mask) and the _maskz_ form, and W in the _mask_
 * form. U is cast to __mmask8 and R to int. */
#define _mm256_range_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (int)(C), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_range_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (int)(C), \
      (__v8sf)(__m256)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_range_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (int)(C), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1854
/* Double-precision reduce with an explicit R operand.
 *
 * All three forms call the vreducepd256 round_mask builtin with A and the
 * int immediate B. The third builtin operand is a zero vector in both the
 * unmasked form (all-ones mask) and the _maskz_ form, and W in the _mask_
 * form. U is cast to __mmask8 and R to int. */
#define _mm256_reduce_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (int)(B), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_reduce_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (int)(B), \
      (__v4df)(__m256d)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_reduce_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (int)(B), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1869
/* Half-precision reduce with an explicit R operand.
 *
 * All three forms call the vreduceph256 round_mask builtin with A and the
 * int immediate imm. The third builtin operand is W (_mask_ form), a zero
 * vector (_maskz_ form) or an undefined vector (unmasked form, all-ones
 * mask). U is cast to __mmask16 and R to int. */
#define _mm256_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (int)(imm), \
      (__v16hf)(__m256h)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (int)(imm), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_reduce_round_ph(A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (int)(imm), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))
1884
/* Single-precision reduce with an explicit R operand.
 *
 * All three forms call the vreduceps256 round_mask builtin with A and the
 * int immediate B. The third builtin operand is a zero vector in both the
 * unmasked form (all-ones mask) and the _maskz_ form, and W in the _mask_
 * form. U is cast to __mmask8 and R to int. */
#define _mm256_reduce_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (int)(B), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_reduce_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (int)(B), \
      (__v8sf)(__m256)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_reduce_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (int)(B), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1899
/* Double-precision roundscale with an explicit R operand, built on the
 * vrndscalepd256 round_mask builtin. imm and R are cast to int.
 *
 * Note the parameter roles differ between the forms:
 *   - unmasked: A is the first builtin operand; the third operand is an
 *     undefined vector and the mask is all-ones
 *   - _mask_:   C is the first builtin operand, A the third operand and B
 *     the __mmask8 mask
 *   - _maskz_:  B is the first builtin operand, the third operand is a zero
 *     vector and A is the __mmask8 mask */
#define _mm256_roundscale_round_pd(A, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (int)(imm), \
      (__v4df)_mm256_undefined_pd(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_roundscale_round_pd(A, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(C), \
      (int)(imm), \
      (__v4df)(__m256d)(A), \
      (__mmask8)(B), \
      (int)(R)))

#define _mm256_maskz_roundscale_round_pd(A, B, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(B), \
      (int)(imm), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(A), \
      (int)(R)))
1914
/* Half-precision roundscale with an explicit R operand, built on the
 * vrndscaleph256 round_mask builtin. imm and R are cast to int.
 *
 * Note the parameter roles differ between the forms:
 *   - unmasked: A is the first builtin operand; the third operand is an
 *     undefined vector and the mask is all-ones
 *   - _mask_:   C is the first builtin operand, A the third operand and B
 *     the __mmask16 mask
 *   - _maskz_:  B is the first builtin operand, the third operand is a zero
 *     vector and A is the __mmask16 mask */
#define _mm256_roundscale_round_ph(A, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (int)(imm), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm256_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(C), \
      (int)(imm), \
      (__v16hf)(__m256h)(A), \
      (__mmask16)(B), \
      (int)(R)))

#define _mm256_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(B), \
      (int)(imm), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(A), \
      (int)(R)))
1929
/* Single-precision roundscale with an explicit R operand, built on the
 * vrndscaleps256 round_mask builtin. imm and R are cast to int.
 *
 * Note the parameter roles differ between the forms:
 *   - unmasked: A is the first builtin operand; the third operand is an
 *     undefined vector and the mask is all-ones
 *   - _mask_:   C is the first builtin operand, A the third operand and B
 *     the __mmask8 mask
 *   - _maskz_:  B is the first builtin operand, the third operand is a zero
 *     vector and A is the __mmask8 mask */
#define _mm256_roundscale_round_ps(A, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (int)(imm), \
      (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_roundscale_round_ps(A, B, C, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(C), \
      (int)(imm), \
      (__v8sf)(__m256)(A), \
      (__mmask8)(B), \
      (int)(R)))

#define _mm256_maskz_roundscale_round_ps(A, B, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(B), \
      (int)(imm), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(A), \
      (int)(R)))
1944
/* Double-precision scalef with an explicit R operand.
 *
 * All three forms call the vscalefpd256 round_mask builtin on A and B. The
 * third builtin operand is an undefined vector (unmasked form, all-ones
 * mask), W (_mask_ form) or a zero vector (_maskz_ form). U is cast to
 * __mmask8 and R to int. */
#define _mm256_scalef_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (__v4df)_mm256_undefined_pd(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (__v4df)(__m256d)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_scalef_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), \
      (int)(R)))
1959
/* Half-precision scalef with an explicit R operand.
 *
 * All three forms call the vscalefph256 round_mask builtin on A and B. The
 * third builtin operand is an undefined vector (unmasked form, all-ones
 * mask), W (_mask_ form) or a zero vector (_maskz_ form). U is cast to
 * __mmask16 and R to int. */
#define _mm256_scalef_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, \
      (int)(R)))

#define _mm256_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (__v16hf)(__m256h)(W), \
      (__mmask16)(U), \
      (int)(R)))

#define _mm256_maskz_scalef_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), \
      (int)(R)))
1974
/* Single-precision scalef with an explicit R operand.
 *
 * All three forms call the vscalefps256 round_mask builtin on A and B. The
 * third builtin operand is an undefined vector (unmasked form, all-ones
 * mask), W (_mask_ form) or a zero vector (_maskz_ form). U is cast to
 * __mmask8 and R to int. */
#define _mm256_scalef_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (__v8sf)(__m256)(W), \
      (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_scalef_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), \
      (int)(R)))
1989
/* Double-precision sqrt with an explicit R operand.
 *
 * The unmasked form calls the vsqrtpd256_round builtin on A with R cast to
 * int. The _mask_ and _maskz_ forms evaluate the unmasked macro and pass its
 * result to __builtin_ia32_selectpd_256 together with the __mmask8 mask U
 * and either W or a zero vector. */
#define _mm256_sqrt_round_pd(A, R) \
  ((__m256d)__builtin_ia32_vsqrtpd256_round( \
      (__v4df)(__m256d)(A), \
      (int)(R)))

#define _mm256_mask_sqrt_round_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_sqrt_round_pd((A), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sqrt_round_pd(U, A, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_sqrt_round_pd((A), (R)), \
      (__v4df)_mm256_setzero_pd()))
2002
/* Half-precision sqrt with an explicit R operand.
 *
 * The unmasked form calls the vsqrtph256_round builtin on A with R cast to
 * int. The _mask_ and _maskz_ forms evaluate the unmasked macro and pass its
 * result to __builtin_ia32_selectph_256 together with the __mmask16 mask U
 * and either W or a zero vector. */
#define _mm256_sqrt_round_ph(A, R) \
  ((__m256h)__builtin_ia32_vsqrtph256_round( \
      (__v16hf)(__m256h)(A), \
      (int)(R)))

#define _mm256_mask_sqrt_round_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sqrt_round_ph(U, A, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
      (__v16hf)_mm256_setzero_ph()))
2015
/* Single-precision sqrt with an explicit R operand.
 *
 * The unmasked form calls the vsqrtps256_round builtin on A with R cast to
 * int. The _mask_ and _maskz_ forms evaluate the unmasked macro and pass its
 * result to __builtin_ia32_selectps_256 together with the __mmask8 mask U
 * and either W or a zero vector. */
#define _mm256_sqrt_round_ps(A, R) \
  ((__m256)__builtin_ia32_vsqrtps256_round( \
      (__v8sf)(__m256)(A), \
      (int)(R)))

#define _mm256_mask_sqrt_round_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_sqrt_round_ps(U, A, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
      (__v8sf)_mm256_setzero_ps()))
2028
/* Double-precision subtract with an explicit R operand.
 *
 * The unmasked form calls the vsubpd256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectpd_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_sub_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vsubpd256_round( \
      (__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), \
      (int)(R)))

#define _mm256_mask_sub_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sub_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), \
      (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))
2042
/* Half-precision subtract with an explicit R operand.
 *
 * The unmasked form calls the vsubph256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectph_256 together with the
 * __mmask16 mask U and either W or a zero vector. */
#define _mm256_sub_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vsubph256_round( \
      (__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), \
      (int)(R)))

#define _mm256_mask_sub_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sub_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), \
      (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))
2056
/* Single-precision subtract with an explicit R operand.
 *
 * The unmasked form calls the vsubps256_round builtin on A and B with R
 * cast to int. The _mask_ and _maskz_ forms evaluate the unmasked macro and
 * pass its result to __builtin_ia32_selectps_256 together with the
 * __mmask8 mask U and either W or a zero vector. */
#define _mm256_sub_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vsubps256_round( \
      (__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), \
      (int)(R)))

#define _mm256_mask_sub_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_sub_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), \
      (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
2070
2071#undef __DEFAULT_FN_ATTRS256
2072#undef __DEFAULT_FN_ATTRS128
2073
2074#endif /* __AVX10_2NIINTRIN_H */
2075#endif /* __SSE2__ */
#define __DEFAULT_FN_ATTRS256
Definition: avx2intrin.h:18
#define __DEFAULT_FN_ATTRS128
Definition: avx2intrin.h:21
unsigned char __mmask8
Definition: avx512fintrin.h:41
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
Definition: avxintrin.h:4340
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition: avxintrin.h:4353
#define _mm256_dpwuuds_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...
#define _mm256_dpwsuds_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
#define _mm_dpwusd_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
#define _mm256_dpwusds_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
#define _mm_dpwsud_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
#define _mm256_dpwsud_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
#define _mm_dpwusds_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
#define _mm_dpwuud_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...
#define _mm_dpwsuds_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of signed 16-bit integers in __A with corresponding unsigned 16-b...
#define _mm256_dpwusd_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding signed 16-b...
#define _mm_dpwuuds_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...
#define _mm256_dpwuud_epi32(__W, __A, __B)
Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in __A with corresponding unsigned 16...
#define _mm_dpbuuds_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
#define _mm256_dpbuuds_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-bit integers in __B, producing 4 intermediate signed 16-bit results.
#define _mm256_dpbssd_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
#define _mm_dpbsud_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
#define _mm256_dpbuud_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
#define _mm256_dpbsud_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
#define _mm256_dpbssds_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
#define _mm_dpbssd_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
#define _mm_dpbssds_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding signed 8-bit i...
#define _mm256_dpbsuds_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
#define _mm_dpbuud_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in __A with corresponding unsigned 8-b...
#define _mm_dpbsuds_epi32(__W, __A, __B)
Multiply groups of 4 adjacent pairs of signed 8-bit integers in __A with corresponding unsigned 8-bit...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3859
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition: xmmintrin.h:2033