/* Doxygen page header (non-code): clang 23.0.0git — avx512fp16intrin.h */
/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9#ifndef __IMMINTRIN_H
10#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11#endif
12
13#ifdef __SSE2__
14
15#ifndef __AVX512FP16INTRIN_H
16#define __AVX512FP16INTRIN_H
17
18/* Define the default attributes for the functions in this file. */
19typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
20typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
21typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
22
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(128)))

/* In C++11 and later some intrinsics can additionally be constexpr. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif
43
45_mm512_cvtsh_h(__m512h __a) {
46 return __a[0];
47}
48
49static __inline __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_setzero_ph(void) {
50 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
51}
52
53static __inline __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
54_mm256_setzero_ph(void) {
55 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
56 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
57}
58
59static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
60 return (__m256h)__builtin_ia32_undef256();
61}
62
63static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
64_mm512_setzero_ph(void) {
65 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
66 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
67 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
68}
69
70static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
71 return (__m128h)__builtin_ia32_undef128();
72}
73
74static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
75 return (__m512h)__builtin_ia32_undef512();
76}
77
78static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
79_mm512_set1_ph(_Float16 __h) {
80 return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
81 __h, __h, __h, __h, __h, __h, __h, __h,
82 __h, __h, __h, __h, __h, __h, __h, __h,
83 __h, __h, __h, __h, __h, __h, __h, __h};
84}
85
86static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
87_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
88 _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
89 _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
90 _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
91 _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
92 _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
93 _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
94 _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
95 return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
96 __h25, __h24, __h23, __h22, __h21, __h20, __h19,
97 __h18, __h17, __h16, __h15, __h14, __h13, __h12,
98 __h11, __h10, __h9, __h8, __h7, __h6, __h5,
99 __h4, __h3, __h2, __h1};
100}
101
102static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setr_ph(
103 _Float16 e0, _Float16 e1, _Float16 e2, _Float16 e3, _Float16 e4,
104 _Float16 e5, _Float16 e6, _Float16 e7, _Float16 e8, _Float16 e9,
105 _Float16 e10, _Float16 e11, _Float16 e12, _Float16 e13, _Float16 e14,
106 _Float16 e15, _Float16 e16, _Float16 e17, _Float16 e18, _Float16 e19,
107 _Float16 e20, _Float16 e21, _Float16 e22, _Float16 e23, _Float16 e24,
108 _Float16 e25, _Float16 e26, _Float16 e27, _Float16 e28, _Float16 e29,
109 _Float16 e30, _Float16 e31) {
110 return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
111 e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
112 e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
113}
114
115static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
116_mm512_set1_pch(_Float16 _Complex __h) {
117 return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
118}
119
120static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_castph_ps(__m128h __a) {
121 return (__m128)__a;
122}
123
124static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_castph_ps(__m256h __a) {
125 return (__m256)__a;
126}
127
128static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_castph_ps(__m512h __a) {
129 return (__m512)__a;
130}
131
132static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_castph_pd(__m128h __a) {
133 return (__m128d)__a;
134}
135
136static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_castph_pd(__m256h __a) {
137 return (__m256d)__a;
138}
139
140static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_castph_pd(__m512h __a) {
141 return (__m512d)__a;
142}
143
144static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_castph_si128(__m128h __a) {
145 return (__m128i)__a;
146}
147
148static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
149_mm256_castph_si256(__m256h __a) {
150 return (__m256i)__a;
151}
152
153static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
154_mm512_castph_si512(__m512h __a) {
155 return (__m512i)__a;
156}
157
158static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_castps_ph(__m128 __a) {
159 return (__m128h)__a;
160}
161
162static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_castps_ph(__m256 __a) {
163 return (__m256h)__a;
164}
165
166static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_castps_ph(__m512 __a) {
167 return (__m512h)__a;
168}
169
170static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_castpd_ph(__m128d __a) {
171 return (__m128h)__a;
172}
173
174static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_castpd_ph(__m256d __a) {
175 return (__m256h)__a;
176}
177
178static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_castpd_ph(__m512d __a) {
179 return (__m512h)__a;
180}
181
182static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_castsi128_ph(__m128i __a) {
183 return (__m128h)__a;
184}
185
186static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
187_mm256_castsi256_ph(__m256i __a) {
188 return (__m256h)__a;
189}
190
191static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
192_mm512_castsi512_ph(__m512i __a) {
193 return (__m512h)__a;
194}
195
196static __inline__ __m128h __DEFAULT_FN_ATTRS256_CONSTEXPR
197_mm256_castph256_ph128(__m256h __a) {
198 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
199}
200
201static __inline__ __m128h __DEFAULT_FN_ATTRS512_CONSTEXPR
202_mm512_castph512_ph128(__m512h __a) {
203 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
204}
205
206static __inline__ __m256h __DEFAULT_FN_ATTRS512_CONSTEXPR
207_mm512_castph512_ph256(__m512h __a) {
208 return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
209 12, 13, 14, 15);
210}
211
212static __inline__ __m256h __DEFAULT_FN_ATTRS256
213_mm256_castph128_ph256(__m128h __a) {
214 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
215 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
216}
217
218static __inline__ __m512h __DEFAULT_FN_ATTRS512
219_mm512_castph128_ph512(__m128h __a) {
220 __m256h __b = __builtin_nondeterministic_value(__b);
221 return __builtin_shufflevector(
222 __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
223 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
224 __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
225 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
226}
227
228static __inline__ __m512h __DEFAULT_FN_ATTRS512
229_mm512_castph256_ph512(__m256h __a) {
230 return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
231 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
232 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
233 27, 28, 29, 30, 31);
234}
235
236/// Constructs a 256-bit floating-point vector of [16 x half] from a
237/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
238/// contain the value of the source vector. The upper 384 bits are set
239/// to zero.
240///
241/// \headerfile <x86intrin.h>
242///
243/// This intrinsic has no corresponding instruction.
244///
245/// \param __a
246/// A 128-bit vector of [8 x half].
247/// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits
248/// contain the value of the parameter. The upper 384 bits are set to zero.
249static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
250_mm256_zextph128_ph256(__m128h __a) {
251 return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
252 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
253}
254
255/// Constructs a 512-bit floating-point vector of [32 x half] from a
256/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
257/// contain the value of the source vector. The upper 384 bits are set
258/// to zero.
259///
260/// \headerfile <x86intrin.h>
261///
262/// This intrinsic has no corresponding instruction.
263///
264/// \param __a
265/// A 128-bit vector of [8 x half].
266/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
267/// contain the value of the parameter. The upper 384 bits are set to zero.
268static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
269_mm512_zextph128_ph512(__m128h __a) {
270 return __builtin_shufflevector(
271 __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
272 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
273}
274
275/// Constructs a 512-bit floating-point vector of [32 x half] from a
276/// 256-bit floating-point vector of [16 x half]. The lower 256 bits
277/// contain the value of the source vector. The upper 256 bits are set
278/// to zero.
279///
280/// \headerfile <x86intrin.h>
281///
282/// This intrinsic has no corresponding instruction.
283///
284/// \param __a
285/// A 256-bit vector of [16 x half].
286/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
287/// contain the value of the parameter. The upper 256 bits are set to zero.
288static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
289_mm512_zextph256_ph512(__m256h __a) {
290 return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
291 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
292 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
293 29, 30, 31);
294}
295
/* Generic scalar-half compare returning an int flag; P selects the predicate
   and R the rounding/exception behavior. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
301
302static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
303 __m128h __B) {
304 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
306}
307
308static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
309 __m128h __B) {
310 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
312}
313
314static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
315 __m128h __B) {
316 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
318}
319
320static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
321 __m128h __B) {
322 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
324}
325
326static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
327 __m128h __B) {
328 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
330}
331
332static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
333 __m128h __B) {
334 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
336}
337
338static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
339 __m128h __B) {
340 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
342}
343
344static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
345 __m128h __B) {
346 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
348}
349
350static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
351 __m128h __B) {
352 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
354}
355
356static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
357 __m128h __B) {
358 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
360}
361
362static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
363 __m128h __B) {
364 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
366}
367
368static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
369 __m128h __B) {
370 return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
372}
373
374static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
375 __m512h __B) {
376 return (__m512h)((__v32hf)__A + (__v32hf)__B);
377}
378
379static __inline__ __m512h __DEFAULT_FN_ATTRS512
380_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
381 return (__m512h)__builtin_ia32_selectph_512(
382 (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
383}
384
385static __inline__ __m512h __DEFAULT_FN_ATTRS512
386_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
387 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
388 (__v32hf)_mm512_add_ph(__A, __B),
389 (__v32hf)_mm512_setzero_ph());
390}
391
392#define _mm512_add_round_ph(A, B, R) \
393 ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
394 (__v32hf)(__m512h)(B), (int)(R)))
395
396#define _mm512_mask_add_round_ph(W, U, A, B, R) \
397 ((__m512h)__builtin_ia32_selectph_512( \
398 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
399 (__v32hf)(__m512h)(W)))
400
401#define _mm512_maskz_add_round_ph(U, A, B, R) \
402 ((__m512h)__builtin_ia32_selectph_512( \
403 (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
404 (__v32hf)_mm512_setzero_ph()))
405
406static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
407 __m512h __B) {
408 return (__m512h)((__v32hf)__A - (__v32hf)__B);
409}
410
411static __inline__ __m512h __DEFAULT_FN_ATTRS512
412_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
413 return (__m512h)__builtin_ia32_selectph_512(
414 (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
415}
416
417static __inline__ __m512h __DEFAULT_FN_ATTRS512
418_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
419 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
420 (__v32hf)_mm512_sub_ph(__A, __B),
421 (__v32hf)_mm512_setzero_ph());
422}
423
424#define _mm512_sub_round_ph(A, B, R) \
425 ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
426 (__v32hf)(__m512h)(B), (int)(R)))
427
428#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
429 ((__m512h)__builtin_ia32_selectph_512( \
430 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
431 (__v32hf)(__m512h)(W)))
432
433#define _mm512_maskz_sub_round_ph(U, A, B, R) \
434 ((__m512h)__builtin_ia32_selectph_512( \
435 (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
436 (__v32hf)_mm512_setzero_ph()))
437
438static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
439 __m512h __B) {
440 return (__m512h)((__v32hf)__A * (__v32hf)__B);
441}
442
443static __inline__ __m512h __DEFAULT_FN_ATTRS512
444_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
445 return (__m512h)__builtin_ia32_selectph_512(
446 (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
447}
448
449static __inline__ __m512h __DEFAULT_FN_ATTRS512
450_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
451 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
452 (__v32hf)_mm512_mul_ph(__A, __B),
453 (__v32hf)_mm512_setzero_ph());
454}
455
456#define _mm512_mul_round_ph(A, B, R) \
457 ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
458 (__v32hf)(__m512h)(B), (int)(R)))
459
460#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
461 ((__m512h)__builtin_ia32_selectph_512( \
462 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
463 (__v32hf)(__m512h)(W)))
464
465#define _mm512_maskz_mul_round_ph(U, A, B, R) \
466 ((__m512h)__builtin_ia32_selectph_512( \
467 (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
468 (__v32hf)_mm512_setzero_ph()))
469
470static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
471 __m512h __B) {
472 return (__m512h)((__v32hf)__A / (__v32hf)__B);
473}
474
475static __inline__ __m512h __DEFAULT_FN_ATTRS512
476_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
477 return (__m512h)__builtin_ia32_selectph_512(
478 (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
479}
480
481static __inline__ __m512h __DEFAULT_FN_ATTRS512
482_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
483 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
484 (__v32hf)_mm512_div_ph(__A, __B),
485 (__v32hf)_mm512_setzero_ph());
486}
487
488#define _mm512_div_round_ph(A, B, R) \
489 ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
490 (__v32hf)(__m512h)(B), (int)(R)))
491
492#define _mm512_mask_div_round_ph(W, U, A, B, R) \
493 ((__m512h)__builtin_ia32_selectph_512( \
494 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
495 (__v32hf)(__m512h)(W)))
496
497#define _mm512_maskz_div_round_ph(U, A, B, R) \
498 ((__m512h)__builtin_ia32_selectph_512( \
499 (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
500 (__v32hf)_mm512_setzero_ph()))
501
502static __inline__ __m512h
503 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_min_ph(__m512h __A, __m512h __B) {
504 return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
506}
507
508static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
509_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
510 return (__m512h)__builtin_ia32_selectph_512(
511 (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
512}
513
514static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
515_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
516 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
517 (__v32hf)_mm512_min_ph(__A, __B),
518 (__v32hf)_mm512_setzero_ph());
519}
520
521#define _mm512_min_round_ph(A, B, R) \
522 ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
523 (__v32hf)(__m512h)(B), (int)(R)))
524
525#define _mm512_mask_min_round_ph(W, U, A, B, R) \
526 ((__m512h)__builtin_ia32_selectph_512( \
527 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
528 (__v32hf)(__m512h)(W)))
529
530#define _mm512_maskz_min_round_ph(U, A, B, R) \
531 ((__m512h)__builtin_ia32_selectph_512( \
532 (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
533 (__v32hf)_mm512_setzero_ph()))
534
535static __inline__ __m512h
536 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_max_ph(__m512h __A, __m512h __B) {
537 return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
539}
540
541static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
542_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
543 return (__m512h)__builtin_ia32_selectph_512(
544 (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
545}
546
547static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
548_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
549 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
550 (__v32hf)_mm512_max_ph(__A, __B),
551 (__v32hf)_mm512_setzero_ph());
552}
553
554#define _mm512_max_round_ph(A, B, R) \
555 ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
556 (__v32hf)(__m512h)(B), (int)(R)))
557
558#define _mm512_mask_max_round_ph(W, U, A, B, R) \
559 ((__m512h)__builtin_ia32_selectph_512( \
560 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
561 (__v32hf)(__m512h)(W)))
562
563#define _mm512_maskz_max_round_ph(U, A, B, R) \
564 ((__m512h)__builtin_ia32_selectph_512( \
565 (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
566 (__v32hf)_mm512_setzero_ph()))
567
568static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
569_mm512_abs_ph(__m512h __A) {
570 return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
571}
572
573static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
574 return (__m512h)_mm512_xor_epi32((__m512i)__A,
575 _mm512_set1_epi32(-2147483648));
576}
577
578static __inline__ __m512h __DEFAULT_FN_ATTRS512
579_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
580 return (__m512h)__builtin_ia32_selectps_512(
581 (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
582}
583
584static __inline__ __m512h __DEFAULT_FN_ATTRS512
585_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
586 return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
587 (__v16sf)_mm512_conj_pch(__A),
588 (__v16sf)_mm512_setzero_ps());
589}
590
591static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
592_mm_add_sh(__m128h __A, __m128h __B) {
593 __A[0] += __B[0];
594 return __A;
595}
596
597static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
598_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
599 __A = _mm_add_sh(__A, __B);
600 return __builtin_ia32_selectsh_128(__U, __A, __W);
601}
602
603static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
604_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) {
605 __A = _mm_add_sh(__A, __B);
606 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
607}
608
609#define _mm_add_round_sh(A, B, R) \
610 ((__m128h)__builtin_ia32_addsh_round_mask( \
611 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
612 (__mmask8)-1, (int)(R)))
613
614#define _mm_mask_add_round_sh(W, U, A, B, R) \
615 ((__m128h)__builtin_ia32_addsh_round_mask( \
616 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
617 (__mmask8)(U), (int)(R)))
618
619#define _mm_maskz_add_round_sh(U, A, B, R) \
620 ((__m128h)__builtin_ia32_addsh_round_mask( \
621 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
622 (__mmask8)(U), (int)(R)))
623
624static __inline__ __m128h
625 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sub_sh(__m128h __A, __m128h __B) {
626 __A[0] -= __B[0];
627 return __A;
628}
629
630static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
631_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
632 __A = _mm_sub_sh(__A, __B);
633 return __builtin_ia32_selectsh_128(__U, __A, __W);
634}
635
636static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
637_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) {
638 __A = _mm_sub_sh(__A, __B);
639 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
640}
641
642#define _mm_sub_round_sh(A, B, R) \
643 ((__m128h)__builtin_ia32_subsh_round_mask( \
644 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
645 (__mmask8)-1, (int)(R)))
646
647#define _mm_mask_sub_round_sh(W, U, A, B, R) \
648 ((__m128h)__builtin_ia32_subsh_round_mask( \
649 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
650 (__mmask8)(U), (int)(R)))
651
652#define _mm_maskz_sub_round_sh(U, A, B, R) \
653 ((__m128h)__builtin_ia32_subsh_round_mask( \
654 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
655 (__mmask8)(U), (int)(R)))
656
657static __inline__ __m128h
658 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mul_sh(__m128h __A, __m128h __B) {
659 __A[0] *= __B[0];
660 return __A;
661}
662
663static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
664_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
665 __A = _mm_mul_sh(__A, __B);
666 return __builtin_ia32_selectsh_128(__U, __A, __W);
667}
668
669static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
670_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) {
671 __A = _mm_mul_sh(__A, __B);
672 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
673}
674
675#define _mm_mul_round_sh(A, B, R) \
676 ((__m128h)__builtin_ia32_mulsh_round_mask( \
677 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
678 (__mmask8)-1, (int)(R)))
679
680#define _mm_mask_mul_round_sh(W, U, A, B, R) \
681 ((__m128h)__builtin_ia32_mulsh_round_mask( \
682 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
683 (__mmask8)(U), (int)(R)))
684
685#define _mm_maskz_mul_round_sh(U, A, B, R) \
686 ((__m128h)__builtin_ia32_mulsh_round_mask( \
687 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
688 (__mmask8)(U), (int)(R)))
689
690static __inline__ __m128h
691 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_div_sh(__m128h __A, __m128h __B) {
692 __A[0] /= __B[0];
693 return __A;
694}
695
696static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
697_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
698 __A = _mm_div_sh(__A, __B);
699 return __builtin_ia32_selectsh_128(__U, __A, __W);
700}
701
702static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
703_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) {
704 __A = _mm_div_sh(__A, __B);
705 return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
706}
707
708#define _mm_div_round_sh(A, B, R) \
709 ((__m128h)__builtin_ia32_divsh_round_mask( \
710 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
711 (__mmask8)-1, (int)(R)))
712
713#define _mm_mask_div_round_sh(W, U, A, B, R) \
714 ((__m128h)__builtin_ia32_divsh_round_mask( \
715 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
716 (__mmask8)(U), (int)(R)))
717
718#define _mm_maskz_div_round_sh(U, A, B, R) \
719 ((__m128h)__builtin_ia32_divsh_round_mask( \
720 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
721 (__mmask8)(U), (int)(R)))
722
723static __inline__ __m128h
724 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_min_sh(__m128h __A, __m128h __B) {
725 return (__m128h)__builtin_ia32_minsh_round_mask(
726 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
728}
729
730static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
731_mm_mask_min_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
732 return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
733 (__v8hf)__W, (__mmask8)__U,
735}
736
737static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
738_mm_maskz_min_sh(__mmask8 __U, __m128h __A, __m128h __B) {
739 return (__m128h)__builtin_ia32_minsh_round_mask(
740 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
742}
743
744#define _mm_min_round_sh(A, B, R) \
745 ((__m128h)__builtin_ia32_minsh_round_mask( \
746 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
747 (__mmask8)-1, (int)(R)))
748
749#define _mm_mask_min_round_sh(W, U, A, B, R) \
750 ((__m128h)__builtin_ia32_minsh_round_mask( \
751 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
752 (__mmask8)(U), (int)(R)))
753
754#define _mm_maskz_min_round_sh(U, A, B, R) \
755 ((__m128h)__builtin_ia32_minsh_round_mask( \
756 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
757 (__mmask8)(U), (int)(R)))
758
759static __inline__ __m128h
760 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_max_sh(__m128h __A, __m128h __B) {
761 return (__m128h)__builtin_ia32_maxsh_round_mask(
762 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
764}
765
766static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
767_mm_mask_max_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
768 return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
769 (__v8hf)__W, (__mmask8)__U,
771}
772
773static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
774_mm_maskz_max_sh(__mmask8 __U, __m128h __A, __m128h __B) {
775 return (__m128h)__builtin_ia32_maxsh_round_mask(
776 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
778}
779
780#define _mm_max_round_sh(A, B, R) \
781 ((__m128h)__builtin_ia32_maxsh_round_mask( \
782 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
783 (__mmask8)-1, (int)(R)))
784
785#define _mm_mask_max_round_sh(W, U, A, B, R) \
786 ((__m128h)__builtin_ia32_maxsh_round_mask( \
787 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
788 (__mmask8)(U), (int)(R)))
789
790#define _mm_maskz_max_round_sh(U, A, B, R) \
791 ((__m128h)__builtin_ia32_maxsh_round_mask( \
792 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
793 (__mmask8)(U), (int)(R)))
794
/* Mask-producing comparisons: packed 512-bit and scalar forms, with and
   without an explicit rounding/SAE argument. */
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
830// loads with vmovsh:
831static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
832 struct __mm_load_sh_struct {
833 _Float16 __u;
834 } __attribute__((__packed__, __may_alias__));
835 _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
836 return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
837}
838
839static __inline__ __m128h __DEFAULT_FN_ATTRS128
840_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
841 __m128h src = (__v8hf)__builtin_shufflevector(
842 (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
843
844 return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
845}
846
847static __inline__ __m128h __DEFAULT_FN_ATTRS128
848_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
849 return (__m128h)__builtin_ia32_loadsh128_mask(
850 (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
851}
852
853static __inline__ __m512h __DEFAULT_FN_ATTRS512
854_mm512_load_ph(void const *__p) {
855 return *(const __m512h *)__p;
856}
857
858static __inline__ __m256h __DEFAULT_FN_ATTRS256
859_mm256_load_ph(void const *__p) {
860 return *(const __m256h *)__p;
861}
862
863static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
864 return *(const __m128h *)__p;
865}
866
867static __inline__ __m512h __DEFAULT_FN_ATTRS512
868_mm512_loadu_ph(void const *__p) {
869 struct __loadu_ph {
870 __m512h_u __v;
871 } __attribute__((__packed__, __may_alias__));
872 return ((const struct __loadu_ph *)__p)->__v;
873}
874
875static __inline__ __m256h __DEFAULT_FN_ATTRS256
876_mm256_loadu_ph(void const *__p) {
877 struct __loadu_ph {
878 __m256h_u __v;
879 } __attribute__((__packed__, __may_alias__));
880 return ((const struct __loadu_ph *)__p)->__v;
881}
882
883static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
884 struct __loadu_ph {
885 __m128h_u __v;
886 } __attribute__((__packed__, __may_alias__));
887 return ((const struct __loadu_ph *)__p)->__v;
888}
889
890// stores with vmovsh:
891static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
892 __m128h __a) {
893 struct __mm_store_sh_struct {
894 _Float16 __u;
895 } __attribute__((__packed__, __may_alias__));
896 ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
897}
898
899static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
900 __mmask8 __U,
901 __m128h __A) {
902 __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
903}
904
905static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
906 __m512h __A) {
907 *(__m512h *)__P = __A;
908}
909
910static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
911 __m256h __A) {
912 *(__m256h *)__P = __A;
913}
914
915static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
916 __m128h __A) {
917 *(__m128h *)__P = __A;
918}
919
920static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
921 __m512h __A) {
922 struct __storeu_ph {
923 __m512h_u __v;
924 } __attribute__((__packed__, __may_alias__));
925 ((struct __storeu_ph *)__P)->__v = __A;
926}
927
928static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
929 __m256h __A) {
930 struct __storeu_ph {
931 __m256h_u __v;
932 } __attribute__((__packed__, __may_alias__));
933 ((struct __storeu_ph *)__P)->__v = __A;
934}
935
936static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
937 __m128h __A) {
938 struct __storeu_ph {
939 __m128h_u __v;
940 } __attribute__((__packed__, __may_alias__));
941 ((struct __storeu_ph *)__P)->__v = __A;
942}
943
944// moves with vmovsh:
945static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
946_mm_move_sh(__m128h __a, __m128h __b) {
947 __a[0] = __b[0];
948 return __a;
949}
950
951static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
952_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
953 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
954}
955
956static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
957_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) {
958 return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
959 _mm_setzero_ph());
960}
961
962// vmovw:
963static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
964 return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
965}
966
967static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
968 __v8hi __b = (__v8hi)__a;
969 return __b[0];
970}
971
972static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
973 return (__m512h)__builtin_ia32_rcpph512_mask(
974 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
975}
976
977static __inline__ __m512h __DEFAULT_FN_ATTRS512
978_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
979 return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
980 (__mmask32)__U);
981}
982
983static __inline__ __m512h __DEFAULT_FN_ATTRS512
984_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
985 return (__m512h)__builtin_ia32_rcpph512_mask(
986 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
987}
988
989static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
990 return (__m512h)__builtin_ia32_rsqrtph512_mask(
991 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
992}
993
994static __inline__ __m512h __DEFAULT_FN_ATTRS512
995_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
996 return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
997 (__mmask32)__U);
998}
999
1000static __inline__ __m512h __DEFAULT_FN_ATTRS512
1001_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
1002 return (__m512h)__builtin_ia32_rsqrtph512_mask(
1003 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
1004}
1005
/* Extract the mantissa of packed FP16 elements. B selects the normalization
   interval, C the sign control; they are packed into one immediate as
   (C << 2) | B. */
#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

/* Explicit-rounding variants. */
#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1036
1037static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
1038 return (__m512h)__builtin_ia32_getexpph512_mask(
1039 (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1041}
1042
1043static __inline__ __m512h __DEFAULT_FN_ATTRS512
1044_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1045 return (__m512h)__builtin_ia32_getexpph512_mask(
1046 (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1047}
1048
1049static __inline__ __m512h __DEFAULT_FN_ATTRS512
1050_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
1051 return (__m512h)__builtin_ia32_getexpph512_mask(
1052 (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1054}
1055
1056#define _mm512_getexp_round_ph(A, R) \
1057 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1058 (__v32hf)_mm512_undefined_ph(), \
1059 (__mmask32)-1, (int)(R)))
1060
1061#define _mm512_mask_getexp_round_ph(W, U, A, R) \
1062 ((__m512h)__builtin_ia32_getexpph512_mask( \
1063 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
1064
1065#define _mm512_maskz_getexp_round_ph(U, A, R) \
1066 ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
1067 (__v32hf)_mm512_setzero_ph(), \
1068 (__mmask32)(U), (int)(R)))
1069
1070static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
1071 __m512h __B) {
1072 return (__m512h)__builtin_ia32_scalefph512_mask(
1073 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
1075}
1076
1077static __inline__ __m512h __DEFAULT_FN_ATTRS512
1078_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
1079 return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
1080 (__v32hf)__W, (__mmask32)__U,
1082}
1083
1084static __inline__ __m512h __DEFAULT_FN_ATTRS512
1085_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
1086 return (__m512h)__builtin_ia32_scalefph512_mask(
1087 (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1089}
1090
1091#define _mm512_scalef_round_ph(A, B, R) \
1092 ((__m512h)__builtin_ia32_scalefph512_mask( \
1093 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1094 (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
1095
1096#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
1097 ((__m512h)__builtin_ia32_scalefph512_mask( \
1098 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
1099 (__mmask32)(U), (int)(R)))
1100
1101#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
1102 ((__m512h)__builtin_ia32_scalefph512_mask( \
1103 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
1104 (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1105
/* Round packed FP16 elements to a given number of fraction bits (vrndscaleph).
   In the unmasked form the pass-through is (A) itself with a full mask, so no
   element of it is observable. */
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))

/* Reduce packed FP16 elements by the scale given in imm (vreduceph). */
#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))
1165
1166static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
1167 __m128h __B) {
1168 return (__m128h)__builtin_ia32_rcpsh_mask(
1169 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1170}
1171
1172static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
1173 __mmask8 __U,
1174 __m128h __A,
1175 __m128h __B) {
1176 return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
1177 (__v8hf)__W, (__mmask8)__U);
1178}
1179
1180static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
1181 __m128h __A,
1182 __m128h __B) {
1183 return (__m128h)__builtin_ia32_rcpsh_mask(
1184 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1185}
1186
1187static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
1188 __m128h __B) {
1189 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1190 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
1191}
1192
1193static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
1194 __mmask8 __U,
1195 __m128h __A,
1196 __m128h __B) {
1197 return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
1198 (__v8hf)__W, (__mmask8)__U);
1199}
1200
1201static __inline__ __m128h __DEFAULT_FN_ATTRS128
1202_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1203 return (__m128h)__builtin_ia32_rsqrtsh_mask(
1204 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1205}
1206
/* Scalar FP16 mantissa extraction; C selects the normalization interval, D the
   sign control, packed as (D << 2) | C. */
#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

/* Scalar FP16 exponent extraction with explicit rounding. */
#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))
1241
1242static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
1243 __m128h __B) {
1244 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1245 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1247}
1248
1249static __inline__ __m128h __DEFAULT_FN_ATTRS128
1250_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1251 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1252 (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
1254}
1255
1256#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
1257 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1258 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1259 (__mmask8)(U), (int)(R)))
1260
1261static __inline__ __m128h __DEFAULT_FN_ATTRS128
1262_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1263 return (__m128h)__builtin_ia32_getexpsh128_round_mask(
1264 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1266}
1267
1268#define _mm_maskz_getexp_round_sh(U, A, B, R) \
1269 ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
1270 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1271 (__mmask8)(U), (int)(R)))
1272
1273#define _mm_scalef_round_sh(A, B, R) \
1274 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1275 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1276 (__mmask8)-1, (int)(R)))
1277
1278static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
1279 __m128h __B) {
1280 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1281 (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1283}
1284
1285static __inline__ __m128h __DEFAULT_FN_ATTRS128
1286_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1287 return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
1288 (__v8hf)__W, (__mmask8)__U,
1290}
1291
1292#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
1293 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1294 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1295 (__mmask8)(U), (int)(R)))
1296
1297static __inline__ __m128h __DEFAULT_FN_ATTRS128
1298_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
1299 return (__m128h)__builtin_ia32_scalefsh_round_mask(
1300 (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1302}
1303
1304#define _mm_maskz_scalef_round_sh(U, A, B, R) \
1305 ((__m128h)__builtin_ia32_scalefsh_round_mask( \
1306 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1307 (__mmask8)(U), (int)(R)))
1308
/* Scalar FP16 roundscale (vrndscalesh) and reduce (vreducesh): element 0 is
   computed from B, upper elements are copied from A. */
#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))
1368
1369#define _mm512_sqrt_round_ph(A, R) \
1370 ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
1371
1372#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
1373 ((__m512h)__builtin_ia32_selectph_512( \
1374 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1375 (__v32hf)(__m512h)(W)))
1376
1377#define _mm512_maskz_sqrt_round_ph(U, A, R) \
1378 ((__m512h)__builtin_ia32_selectph_512( \
1379 (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
1380 (__v32hf)_mm512_setzero_ph()))
1381
1382static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
1383 return (__m512h)__builtin_elementwise_sqrt((__v32hf)__A);
1384}
1385
1386static __inline__ __m512h __DEFAULT_FN_ATTRS512
1387_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
1388 return (__m512h)__builtin_ia32_selectph_512(
1389 (__mmask32)(__U), (__v32hf)_mm512_sqrt_ph(__A), (__v32hf)(__m512h)(__W));
1390}
1391
1392static __inline__ __m512h __DEFAULT_FN_ATTRS512
1393_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
1394 return (__m512h)__builtin_ia32_selectph_512((__mmask32)(__U),
1395 (__v32hf)_mm512_sqrt_ph(__A),
1396 (__v32hf)_mm512_setzero_ph());
1397}
1398
1399#define _mm_sqrt_round_sh(A, B, R) \
1400 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1401 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1402 (__mmask8)-1, (int)(R)))
1403
1404#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
1405 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1406 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
1407 (__mmask8)(U), (int)(R)))
1408
1409#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
1410 ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
1411 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
1412 (__mmask8)(U), (int)(R)))
1413
1414static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1415 __m128h __B) {
1416 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1417 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1419}
1420
1421static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1422 __mmask32 __U,
1423 __m128h __A,
1424 __m128h __B) {
1425 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1426 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1428}
1429
1430static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1431 __m128h __A,
1432 __m128h __B) {
1433 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1434 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1436}
1437
/* Classify FP16 elements (vfpclassph / vfpclasssh); imm selects the category
   set to test, the result is a per-element mask. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1453
1454#define _mm512_cvt_roundpd_ph(A, R) \
1455 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1456 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1457
1458#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1459 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1460 (__mmask8)(U), (int)(R)))
1461
1462#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1463 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1464 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1465
1466static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1467 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1468 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1470}
1471
1472static __inline__ __m128h __DEFAULT_FN_ATTRS512
1473_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1474 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1475 (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1476}
1477
1478static __inline__ __m128h __DEFAULT_FN_ATTRS512
1479_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1480 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1481 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1483}
1484
1485#define _mm512_cvt_roundph_pd(A, R) \
1486 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1487 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1488
1489#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1490 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1491 (__mmask8)(U), (int)(R)))
1492
1493#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1494 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1495 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1496
1497static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1498 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1499 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1501}
1502
1503static __inline__ __m512d __DEFAULT_FN_ATTRS512
1504_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1505 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1506 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1507}
1508
1509static __inline__ __m512d __DEFAULT_FN_ATTRS512
1510_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1511 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1512 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1514}
1515
1516#define _mm_cvt_roundsh_ss(A, B, R) \
1517 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1518 (__v4sf)_mm_undefined_ps(), \
1519 (__mmask8)(-1), (int)(R)))
1520
1521#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1522 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1523 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1524
1525#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1526 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1527 (__v4sf)_mm_setzero_ps(), \
1528 (__mmask8)(U), (int)(R)))
1529
1530static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1531 __m128h __B) {
1532 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1533 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1535}
1536
1537static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1538 __mmask8 __U,
1539 __m128 __A,
1540 __m128h __B) {
1541 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1542 (__v4sf)__W, (__mmask8)__U,
1544}
1545
1546static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1547 __m128 __A,
1548 __m128h __B) {
1549 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1550 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1552}
1553
1554#define _mm_cvt_roundss_sh(A, B, R) \
1555 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1556 (__v8hf)_mm_undefined_ph(), \
1557 (__mmask8)(-1), (int)(R)))
1558
1559#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1560 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1561 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1562
1563#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1564 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1565 (__v8hf)_mm_setzero_ph(), \
1566 (__mmask8)(U), (int)(R)))
1567
1568static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1569 __m128 __B) {
1570 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1571 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1573}
1574
1575static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1576 __mmask8 __U,
1577 __m128h __A,
1578 __m128 __B) {
1579 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1580 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1582}
1583
1584static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1585 __m128h __A,
1586 __m128 __B) {
1587 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1588 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1590}
1591
1592#define _mm_cvt_roundsd_sh(A, B, R) \
1593 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1594 (__v8hf)_mm_undefined_ph(), \
1595 (__mmask8)(-1), (int)(R)))
1596
1597#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1598 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1599 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1600
1601#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1602 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1603 (__v8hf)_mm_setzero_ph(), \
1604 (__mmask8)(U), (int)(R)))
1605
1606static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1607 __m128d __B) {
1608 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1609 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1611}
1612
1613static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1614 __mmask8 __U,
1615 __m128h __A,
1616 __m128d __B) {
1617 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1618 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1620}
1621
1622static __inline__ __m128h __DEFAULT_FN_ATTRS128
1623_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1624 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1625 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1627}
1628
1629#define _mm_cvt_roundsh_sd(A, B, R) \
1630 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1631 (__v2df)_mm_undefined_pd(), \
1632 (__mmask8)(-1), (int)(R)))
1633
1634#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1635 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1636 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1637
1638#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1639 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1640 (__v2df)_mm_setzero_pd(), \
1641 (__mmask8)(U), (int)(R)))
1642
1643static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1644 __m128h __B) {
1645 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1646 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1648}
1649
1650static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1651 __mmask8 __U,
1652 __m128d __A,
1653 __m128h __B) {
1654 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1655 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1657}
1658
1659static __inline__ __m128d __DEFAULT_FN_ATTRS128
1660_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1661 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1662 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1664}
1665
1666#define _mm512_cvt_roundph_epi16(A, R) \
1667 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1668 (__v32hi)_mm512_undefined_epi32(), \
1669 (__mmask32)(-1), (int)(R)))
1670
1671#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1672 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1673 (__mmask32)(U), (int)(R)))
1674
1675#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1676 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1677 (__v32hi)_mm512_setzero_epi32(), \
1678 (__mmask32)(U), (int)(R)))
1679
1680static __inline__ __m512i __DEFAULT_FN_ATTRS512
1681_mm512_cvtph_epi16(__m512h __A) {
1682 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1683 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1685}
1686
1687static __inline__ __m512i __DEFAULT_FN_ATTRS512
1688_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1689 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1690 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1691}
1692
1693static __inline__ __m512i __DEFAULT_FN_ATTRS512
1694_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1695 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1696 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1698}
1699
1700#define _mm512_cvtt_roundph_epi16(A, R) \
1701 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1702 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1703 (int)(R)))
1704
1705#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1706 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1707 (__mmask32)(U), (int)(R)))
1708
1709#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1710 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1711 (__v32hi)_mm512_setzero_epi32(), \
1712 (__mmask32)(U), (int)(R)))
1713
1714static __inline__ __m512i __DEFAULT_FN_ATTRS512
1715_mm512_cvttph_epi16(__m512h __A) {
1716 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1717 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1719}
1720
1721static __inline__ __m512i __DEFAULT_FN_ATTRS512
1722_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1723 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1724 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1725}
1726
1727static __inline__ __m512i __DEFAULT_FN_ATTRS512
1728_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1729 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1730 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1732}
1733
1734#define _mm512_cvt_roundepi16_ph(A, R) \
1735 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1736 (__v32hf)_mm512_undefined_ph(), \
1737 (__mmask32)(-1), (int)(R)))
1738
1739#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1740 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1741 (__mmask32)(U), (int)(R)))
1742
1743#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1744 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1745 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1746
1747static __inline__ __m512h __DEFAULT_FN_ATTRS512
1748_mm512_cvtepi16_ph(__m512i __A) {
1749 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1750 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1752}
1753
1754static __inline__ __m512h __DEFAULT_FN_ATTRS512
1755_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1756 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1757 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1758}
1759
1760static __inline__ __m512h __DEFAULT_FN_ATTRS512
1761_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1762 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1763 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1765}
1766
1767#define _mm512_cvt_roundph_epu16(A, R) \
1768 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1769 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1770 (int)(R)))
1771
1772#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1773 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1774 (__mmask32)(U), (int)(R)))
1775
1776#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1777 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1778 (__v32hu)_mm512_setzero_epi32(), \
1779 (__mmask32)(U), (int)(R)))
1780
1781static __inline__ __m512i __DEFAULT_FN_ATTRS512
1782_mm512_cvtph_epu16(__m512h __A) {
1783 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1784 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1786}
1787
1788static __inline__ __m512i __DEFAULT_FN_ATTRS512
1789_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1790 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1791 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1792}
1793
1794static __inline__ __m512i __DEFAULT_FN_ATTRS512
1795_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1796 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1797 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1799}
1800
1801#define _mm512_cvtt_roundph_epu16(A, R) \
1802 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1803 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1804 (int)(R)))
1805
1806#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1807 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1808 (__mmask32)(U), (int)(R)))
1809
1810#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1811 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1812 (__v32hu)_mm512_setzero_epi32(), \
1813 (__mmask32)(U), (int)(R)))
1814
1815static __inline__ __m512i __DEFAULT_FN_ATTRS512
1816_mm512_cvttph_epu16(__m512h __A) {
1817 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1818 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1820}
1821
1822static __inline__ __m512i __DEFAULT_FN_ATTRS512
1823_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1824 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1825 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1826}
1827
1828static __inline__ __m512i __DEFAULT_FN_ATTRS512
1829_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1830 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1831 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1833}
1834
1835#define _mm512_cvt_roundepu16_ph(A, R) \
1836 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1837 (__v32hf)_mm512_undefined_ph(), \
1838 (__mmask32)(-1), (int)(R)))
1839
1840#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1841 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1842 (__mmask32)(U), (int)(R)))
1843
1844#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1845 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1846 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1847
1848static __inline__ __m512h __DEFAULT_FN_ATTRS512
1849_mm512_cvtepu16_ph(__m512i __A) {
1850 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1851 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1853}
1854
1855static __inline__ __m512h __DEFAULT_FN_ATTRS512
1856_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1857 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1858 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1859}
1860
1861static __inline__ __m512h __DEFAULT_FN_ATTRS512
1862_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1863 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1864 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1866}
1867
1868#define _mm512_cvt_roundph_epi32(A, R) \
1869 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1870 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1871 (int)(R)))
1872
1873#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1874 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1875 (__mmask16)(U), (int)(R)))
1876
1877#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1878 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1879 (__v16si)_mm512_setzero_epi32(), \
1880 (__mmask16)(U), (int)(R)))
1881
1882static __inline__ __m512i __DEFAULT_FN_ATTRS512
1883_mm512_cvtph_epi32(__m256h __A) {
1884 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1885 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1887}
1888
1889static __inline__ __m512i __DEFAULT_FN_ATTRS512
1890_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1891 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1892 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1893}
1894
1895static __inline__ __m512i __DEFAULT_FN_ATTRS512
1896_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1897 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1898 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1900}
1901
1902#define _mm512_cvt_roundph_epu32(A, R) \
1903 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1904 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1905 (int)(R)))
1906
1907#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1908 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1909 (__mmask16)(U), (int)(R)))
1910
1911#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1912 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1913 (__v16su)_mm512_setzero_epi32(), \
1914 (__mmask16)(U), (int)(R)))
1915
1916static __inline__ __m512i __DEFAULT_FN_ATTRS512
1917_mm512_cvtph_epu32(__m256h __A) {
1918 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1919 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1921}
1922
1923static __inline__ __m512i __DEFAULT_FN_ATTRS512
1924_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1925 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1926 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1927}
1928
1929static __inline__ __m512i __DEFAULT_FN_ATTRS512
1930_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1931 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1932 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1934}
1935
1936#define _mm512_cvt_roundepi32_ph(A, R) \
1937 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1938 (__v16hf)_mm256_undefined_ph(), \
1939 (__mmask16)(-1), (int)(R)))
1940
1941#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1942 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1943 (__mmask16)(U), (int)(R)))
1944
1945#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1946 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1947 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1948
1949static __inline__ __m256h __DEFAULT_FN_ATTRS512
1950_mm512_cvtepi32_ph(__m512i __A) {
1951 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1952 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1954}
1955
1956static __inline__ __m256h __DEFAULT_FN_ATTRS512
1957_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1958 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1959 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1960}
1961
1962static __inline__ __m256h __DEFAULT_FN_ATTRS512
1963_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1964 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1965 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1967}
1968
1969#define _mm512_cvt_roundepu32_ph(A, R) \
1970 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1971 (__v16hf)_mm256_undefined_ph(), \
1972 (__mmask16)(-1), (int)(R)))
1973
1974#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1975 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1976 (__mmask16)(U), (int)(R)))
1977
1978#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1979 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1980 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1981
1982static __inline__ __m256h __DEFAULT_FN_ATTRS512
1983_mm512_cvtepu32_ph(__m512i __A) {
1984 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1985 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1987}
1988
1989static __inline__ __m256h __DEFAULT_FN_ATTRS512
1990_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1991 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1992 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1993}
1994
1995static __inline__ __m256h __DEFAULT_FN_ATTRS512
1996_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
1997 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1998 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2000}
2001
2002#define _mm512_cvtt_roundph_epi32(A, R) \
2003 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2004 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2005 (int)(R)))
2006
2007#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2008 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2009 (__mmask16)(U), (int)(R)))
2010
2011#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2012 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2013 (__v16si)_mm512_setzero_epi32(), \
2014 (__mmask16)(U), (int)(R)))
2015
2016static __inline__ __m512i __DEFAULT_FN_ATTRS512
2017_mm512_cvttph_epi32(__m256h __A) {
2018 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2019 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2021}
2022
2023static __inline__ __m512i __DEFAULT_FN_ATTRS512
2024_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2025 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2026 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2027}
2028
2029static __inline__ __m512i __DEFAULT_FN_ATTRS512
2030_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2031 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2032 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2034}
2035
2036#define _mm512_cvtt_roundph_epu32(A, R) \
2037 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2038 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2039 (int)(R)))
2040
2041#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2042 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2043 (__mmask16)(U), (int)(R)))
2044
2045#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2046 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2047 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2048 (int)(R)))
2049
2050static __inline__ __m512i __DEFAULT_FN_ATTRS512
2051_mm512_cvttph_epu32(__m256h __A) {
2052 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2053 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2055}
2056
2057static __inline__ __m512i __DEFAULT_FN_ATTRS512
2058_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2059 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2060 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2061}
2062
2063static __inline__ __m512i __DEFAULT_FN_ATTRS512
2064_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2065 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2066 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2068}
2069
2070#define _mm512_cvt_roundepi64_ph(A, R) \
2071 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2072 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2073
2074#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2075 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2076 (__mmask8)(U), (int)(R)))
2077
2078#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2079 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2080 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2081
2082static __inline__ __m128h __DEFAULT_FN_ATTRS512
2083_mm512_cvtepi64_ph(__m512i __A) {
2084 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2085 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2087}
2088
2089static __inline__ __m128h __DEFAULT_FN_ATTRS512
2090_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2091 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2092 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2093}
2094
2095static __inline__ __m128h __DEFAULT_FN_ATTRS512
2096_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2097 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2098 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2100}
2101
2102#define _mm512_cvt_roundph_epi64(A, R) \
2103 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2104 (__v8di)_mm512_undefined_epi32(), \
2105 (__mmask8)(-1), (int)(R)))
2106
2107#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2108 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2109 (__mmask8)(U), (int)(R)))
2110
2111#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2112 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2113 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2114
2115static __inline__ __m512i __DEFAULT_FN_ATTRS512
2116_mm512_cvtph_epi64(__m128h __A) {
2117 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2118 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2120}
2121
2122static __inline__ __m512i __DEFAULT_FN_ATTRS512
2123_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2124 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2125 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2126}
2127
2128static __inline__ __m512i __DEFAULT_FN_ATTRS512
2129_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2130 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2131 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2133}
2134
2135#define _mm512_cvt_roundepu64_ph(A, R) \
2136 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2137 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2138
2139#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2140 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2141 (__mmask8)(U), (int)(R)))
2142
2143#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2144 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2145 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2146
2147static __inline__ __m128h __DEFAULT_FN_ATTRS512
2148_mm512_cvtepu64_ph(__m512i __A) {
2149 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2150 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2152}
2153
2154static __inline__ __m128h __DEFAULT_FN_ATTRS512
2155_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2156 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2157 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2158}
2159
2160static __inline__ __m128h __DEFAULT_FN_ATTRS512
2161_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2162 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2163 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2165}
2166
2167#define _mm512_cvt_roundph_epu64(A, R) \
2168 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2169 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2170 (int)(R)))
2171
2172#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2173 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2174 (__mmask8)(U), (int)(R)))
2175
2176#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2177 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2178 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2179
2180static __inline__ __m512i __DEFAULT_FN_ATTRS512
2181_mm512_cvtph_epu64(__m128h __A) {
2182 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2183 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2185}
2186
2187static __inline__ __m512i __DEFAULT_FN_ATTRS512
2188_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2189 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2190 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2191}
2192
2193static __inline__ __m512i __DEFAULT_FN_ATTRS512
2194_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2195 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2196 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2198}
2199
2200#define _mm512_cvtt_roundph_epi64(A, R) \
2201 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2202 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2203 (int)(R)))
2204
2205#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2206 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2207 (__mmask8)(U), (int)(R)))
2208
2209#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2210 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2211 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2212
2213static __inline__ __m512i __DEFAULT_FN_ATTRS512
2214_mm512_cvttph_epi64(__m128h __A) {
2215 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2216 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2218}
2219
2220static __inline__ __m512i __DEFAULT_FN_ATTRS512
2221_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2222 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2223 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2224}
2225
2226static __inline__ __m512i __DEFAULT_FN_ATTRS512
2227_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2228 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2229 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2231}
2232
2233#define _mm512_cvtt_roundph_epu64(A, R) \
2234 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2235 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2236 (int)(R)))
2237
2238#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2239 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2240 (__mmask8)(U), (int)(R)))
2241
2242#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2243 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2244 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2245
2246static __inline__ __m512i __DEFAULT_FN_ATTRS512
2247_mm512_cvttph_epu64(__m128h __A) {
2248 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2249 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2251}
2252
2253static __inline__ __m512i __DEFAULT_FN_ATTRS512
2254_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2255 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2256 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2257}
2258
2259static __inline__ __m512i __DEFAULT_FN_ATTRS512
2260_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2261 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2262 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2264}
2265
2266#define _mm_cvt_roundsh_i32(A, R) \
2267 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2268
2269static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2270 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2271}
2272
2273#define _mm_cvt_roundsh_u32(A, R) \
2274 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2275
2276static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2277_mm_cvtsh_u32(__m128h __A) {
2278 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2280}
2281
2282#ifdef __x86_64__
2283#define _mm_cvt_roundsh_i64(A, R) \
2284 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2285
2286static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2287 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2289}
2290
2291#define _mm_cvt_roundsh_u64(A, R) \
2292 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2293
2294static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2295_mm_cvtsh_u64(__m128h __A) {
2296 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2297 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2298}
2299#endif // __x86_64__
2300
2301#define _mm_cvt_roundu32_sh(A, B, R) \
2302 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2303
2304static __inline__ __m128h __DEFAULT_FN_ATTRS128
2305_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2306 __A[0] = __B;
2307 return __A;
2308}
2309
2310#ifdef __x86_64__
2311#define _mm_cvt_roundu64_sh(A, B, R) \
2312 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2313 (int)(R)))
2314
2315static __inline__ __m128h __DEFAULT_FN_ATTRS128
2316_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2317 __A[0] = __B;
2318 return __A;
2319}
2320#endif
2321
2322#define _mm_cvt_roundi32_sh(A, B, R) \
2323 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2324
2325static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2326 int __B) {
2327 __A[0] = __B;
2328 return __A;
2329}
2330
2331#ifdef __x86_64__
2332#define _mm_cvt_roundi64_sh(A, B, R) \
2333 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2334
2335static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2336 long long __B) {
2337 __A[0] = __B;
2338 return __A;
2339}
2340#endif
2341
2342#define _mm_cvtt_roundsh_i32(A, R) \
2343 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2344
2345static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2346 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2348}
2349
2350#ifdef __x86_64__
2351#define _mm_cvtt_roundsh_i64(A, R) \
2352 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2353
2354static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2355 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2357}
2358#endif
2359
2360#define _mm_cvtt_roundsh_u32(A, R) \
2361 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2362
2363static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2364_mm_cvttsh_u32(__m128h __A) {
2365 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2367}
2368
2369#ifdef __x86_64__
2370#define _mm_cvtt_roundsh_u64(A, R) \
2371 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2372
2373static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2374_mm_cvttsh_u64(__m128h __A) {
2375 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2376 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2377}
2378#endif
2379
2380#define _mm512_cvtx_roundph_ps(A, R) \
2381 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2382 (__v16sf)_mm512_undefined_ps(), \
2383 (__mmask16)(-1), (int)(R)))
2384
2385#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2386 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2387 (__mmask16)(U), (int)(R)))
2388
2389#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2390 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2391 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2392
2393static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2394 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2395 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2397}
2398
2399static __inline__ __m512 __DEFAULT_FN_ATTRS512
2400_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2401 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2402 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2403}
2404
2405static __inline__ __m512 __DEFAULT_FN_ATTRS512
2406_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2407 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2408 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2410}
2411
2412#define _mm512_cvtx_roundps_ph(A, R) \
2413 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2414 (__v16hf)_mm256_undefined_ph(), \
2415 (__mmask16)(-1), (int)(R)))
2416
2417#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2418 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2419 (__mmask16)(U), (int)(R)))
2420
2421#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2422 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2423 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2424
2425static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2426 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2427 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2429}
2430
2431static __inline__ __m256h __DEFAULT_FN_ATTRS512
2432_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2433 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2434 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2435}
2436
2437static __inline__ __m256h __DEFAULT_FN_ATTRS512
2438_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2439 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2440 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2442}
2443
/* 512-bit FP16 fused multiply-add family with explicit rounding control R.
   fmsub/fnmadd/fnmsub are expressed by negating the appropriate operands of
   the single vfmaddph builtin; _mask merges from the first source, _mask3
   merges from the third, _maskz zeroes masked-off lanes. */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2503
2504static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2505 __m512h __B,
2506 __m512h __C) {
2507 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2508 (__v32hf)__C, (__mmask32)-1,
2510}
2511
2512static __inline__ __m512h __DEFAULT_FN_ATTRS512
2513_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2514 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2515 (__v32hf)__C, (__mmask32)__U,
2517}
2518
2519static __inline__ __m512h __DEFAULT_FN_ATTRS512
2520_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2521 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2522 (__v32hf)__C, (__mmask32)__U,
2524}
2525
2526static __inline__ __m512h __DEFAULT_FN_ATTRS512
2527_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2528 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2529 (__v32hf)__C, (__mmask32)__U,
2531}
2532
2533static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2534 __m512h __B,
2535 __m512h __C) {
2536 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2537 -(__v32hf)__C, (__mmask32)-1,
2539}
2540
2541static __inline__ __m512h __DEFAULT_FN_ATTRS512
2542_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2543 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2544 -(__v32hf)__C, (__mmask32)__U,
2546}
2547
2548static __inline__ __m512h __DEFAULT_FN_ATTRS512
2549_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2550 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2551 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2553}
2554
2555static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2556 __m512h __B,
2557 __m512h __C) {
2558 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2559 (__v32hf)__C, (__mmask32)-1,
2561}
2562
2563static __inline__ __m512h __DEFAULT_FN_ATTRS512
2564_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2565 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2566 (__v32hf)__C, (__mmask32)__U,
2568}
2569
2570static __inline__ __m512h __DEFAULT_FN_ATTRS512
2571_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2572 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2573 (__v32hf)__C, (__mmask32)__U,
2575}
2576
2577static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2578 __m512h __B,
2579 __m512h __C) {
2580 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2581 -(__v32hf)__C, (__mmask32)-1,
2583}
2584
2585static __inline__ __m512h __DEFAULT_FN_ATTRS512
2586_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2587 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2588 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2590}
2591
/* FMADDSUB: odd lanes get A*B+C, even lanes A*B-C; FMSUBADD is the reverse
   (achieved here by negating C before FMADDSUB).  R is the rounding mode. */
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2626
2627static __inline__ __m512h __DEFAULT_FN_ATTRS512
2628_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2629 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2630 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2632}
2633
2634static __inline__ __m512h __DEFAULT_FN_ATTRS512
2635_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2636 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2637 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2639}
2640
2641static __inline__ __m512h __DEFAULT_FN_ATTRS512
2642_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2643 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2644 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2646}
2647
2648static __inline__ __m512h __DEFAULT_FN_ATTRS512
2649_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2650 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2651 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2653}
2654
2655static __inline__ __m512h __DEFAULT_FN_ATTRS512
2656_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2657 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2658 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2660}
2661
2662static __inline__ __m512h __DEFAULT_FN_ATTRS512
2663_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2664 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2665 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2667}
2668
2669static __inline__ __m512h __DEFAULT_FN_ATTRS512
2670_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2671 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2672 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2674}
2675
2676#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2677 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2678 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2679 (__mmask32)(U), (int)(R)))
2680
2681static __inline__ __m512h __DEFAULT_FN_ATTRS512
2682_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2683 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2684 (__v32hf)__C, (__mmask32)__U,
2686}
2687
2688#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2689 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2690 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2691 (__mmask32)(U), (int)(R)))
2692
2693static __inline__ __m512h __DEFAULT_FN_ATTRS512
2694_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2695 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2696 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2698}
2699
2700#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2701 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2702 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2703 (__mmask32)(U), (int)(R)))
2704
2705static __inline__ __m512h __DEFAULT_FN_ATTRS512
2706_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2707 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2708 (__v32hf)__C, (__mmask32)__U,
2710}
2711
2712#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2713 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2714 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2715 (__mmask32)(U), (int)(R)))
2716
2717#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2718 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2719 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2720 (__mmask32)(U), (int)(R)))
2721
2722static __inline__ __m512h __DEFAULT_FN_ATTRS512
2723_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2724 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2725 -(__v32hf)__C, (__mmask32)__U,
2727}
2728
2729static __inline__ __m512h __DEFAULT_FN_ATTRS512
2730_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2731 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2732 (__v32hf)__C, (__mmask32)__U,
2734}
2735
2736static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2737 __m128h __A,
2738 __m128h __B) {
2739 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2741}
2742
2743static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2744 __mmask8 __U,
2745 __m128h __A,
2746 __m128h __B) {
2747 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2749}
2750
2751#define _mm_fmadd_round_sh(A, B, C, R) \
2752 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2753 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2754 (__mmask8)-1, (int)(R)))
2755
2756#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2757 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2758 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2759 (__mmask8)(U), (int)(R)))
2760
2761static __inline__ __m128h __DEFAULT_FN_ATTRS128
2762_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2763 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2764 (__mmask8)__U,
2766}
2767
2768#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2769 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2770 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2771 (__mmask8)(U), (int)(R)))
2772
2773static __inline__ __m128h __DEFAULT_FN_ATTRS128
2774_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2775 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2776 (__mmask8)__U,
2778}
2779
2780#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2781 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2782 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2783 (__mmask8)(U), (int)(R)))
2784
2785static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2786 __m128h __A,
2787 __m128h __B) {
2788 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2789 -(__v8hf)__B, (__mmask8)-1,
2791}
2792
2793static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2794 __mmask8 __U,
2795 __m128h __A,
2796 __m128h __B) {
2797 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2798 -(__v8hf)__B, (__mmask8)__U,
2800}
2801
2802#define _mm_fmsub_round_sh(A, B, C, R) \
2803 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2804 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2805 (__mmask8)-1, (int)(R)))
2806
2807#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2808 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2809 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2810 (__mmask8)(U), (int)(R)))
2811
2812static __inline__ __m128h __DEFAULT_FN_ATTRS128
2813_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2814 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2815 -(__v8hf)__C, (__mmask8)__U,
2817}
2818
2819#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2820 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2821 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2822 (__mmask8)(U), (int)R))
2823
2824static __inline__ __m128h __DEFAULT_FN_ATTRS128
2825_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2826 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2827 (__mmask8)__U,
2829}
2830
2831#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2832 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2833 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2834 (__mmask8)(U), (int)(R)))
2835
2836static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2837 __m128h __A,
2838 __m128h __B) {
2839 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2841}
2842
2843static __inline__ __m128h __DEFAULT_FN_ATTRS128
2844_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2847}
2848
2849#define _mm_fnmadd_round_sh(A, B, C, R) \
2850 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2851 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2852 (__mmask8)-1, (int)(R)))
2853
2854#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2855 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2856 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2857 (__mmask8)(U), (int)(R)))
2858
2859static __inline__ __m128h __DEFAULT_FN_ATTRS128
2860_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2861 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2862 (__mmask8)__U,
2864}
2865
2866#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2867 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2868 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2869 (__mmask8)(U), (int)(R)))
2870
2871static __inline__ __m128h __DEFAULT_FN_ATTRS128
2872_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2873 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2874 (__mmask8)__U,
2876}
2877
2878#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2879 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2880 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2881 (__mmask8)(U), (int)(R)))
2882
2883static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2884 __m128h __A,
2885 __m128h __B) {
2886 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2888}
2889
2890static __inline__ __m128h __DEFAULT_FN_ATTRS128
2891_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2894}
2895
2896#define _mm_fnmsub_round_sh(A, B, C, R) \
2897 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2898 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2899 (__mmask8)-1, (int)(R)))
2900
2901#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2902 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2903 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2904 (__mmask8)(U), (int)(R)))
2905
2906static __inline__ __m128h __DEFAULT_FN_ATTRS128
2907_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2908 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2909 (__mmask8)__U,
2911}
2912
2913#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2914 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2915 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2916 (__mmask8)(U), (int)(R)))
2917
2918static __inline__ __m128h __DEFAULT_FN_ATTRS128
2919_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2920 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2921 (__mmask8)__U,
2923}
2924
2925#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2926 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2927 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2928 (__mmask8)(U), (int)(R)))
2929
2930static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2931 __m128h __B,
2932 __m128h __C) {
2933 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2934 (__v4sf)__C, (__mmask8)-1,
2936}
2937
2938static __inline__ __m128h __DEFAULT_FN_ATTRS128
2939_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2940 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2941 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2942}
2943
2944static __inline__ __m128h __DEFAULT_FN_ATTRS128
2945_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2946 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2947 (__v4sf)__C, (__mmask8)__U,
2949}
2950
2951static __inline__ __m128h __DEFAULT_FN_ATTRS128
2952_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2953 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2954 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2955}
2956
2957#define _mm_fcmadd_round_sch(A, B, C, R) \
2958 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2959 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2960 (__mmask8)-1, (int)(R)))
2961
2962#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2963 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2964 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2965 (__mmask8)(U), (int)(R)))
2966
2967#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2968 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2969 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2970 (__mmask8)(U), (int)(R)))
2971
2972#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2973 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2974 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2975 (__mmask8)(U), (int)(R)))
2976
2977static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2978 __m128h __B,
2979 __m128h __C) {
2980 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2981 (__v4sf)__C, (__mmask8)-1,
2983}
2984
2985static __inline__ __m128h __DEFAULT_FN_ATTRS128
2986_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2987 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2988 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2989}
2990
2991static __inline__ __m128h __DEFAULT_FN_ATTRS128
2992_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2993 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2994 (__v4sf)__C, (__mmask8)__U,
2996}
2997
2998static __inline__ __m128h __DEFAULT_FN_ATTRS128
2999_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3000 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3001 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3002}
3003
3004#define _mm_fmadd_round_sch(A, B, C, R) \
3005 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3006 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3007 (__mmask8)-1, (int)(R)))
3008
3009#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3010 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3011 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3012 (__mmask8)(U), (int)(R)))
3013
3014#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3015 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3016 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3017 (__mmask8)(U), (int)(R)))
3018
3019#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3020 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3021 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3022 (__mmask8)(U), (int)(R)))
3023
3024static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3025 __m128h __B) {
3026 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3027 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3029}
3030
3031static __inline__ __m128h __DEFAULT_FN_ATTRS128
3032_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3033 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3034 (__v4sf)__W, (__mmask8)__U,
3036}
3037
3038static __inline__ __m128h __DEFAULT_FN_ATTRS128
3039_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3040 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3041 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3043}
3044
3045#define _mm_fcmul_round_sch(A, B, R) \
3046 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3047 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3048 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3049
3050#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3051 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3052 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3053 (__mmask8)(U), (int)(R)))
3054
3055#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3056 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3057 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3058 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3059
3060static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3061 __m128h __B) {
3062 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3063 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3065}
3066
3067static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3068 __mmask8 __U,
3069 __m128h __A,
3070 __m128h __B) {
3071 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3072 (__v4sf)__W, (__mmask8)__U,
3074}
3075
3076static __inline__ __m128h __DEFAULT_FN_ATTRS128
3077_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3078 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3079 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3081}
3082
3083#define _mm_fmul_round_sch(A, B, R) \
3084 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3085 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3086 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3087
3088#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3089 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3090 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3091 (__mmask8)(U), (int)(R)))
3092
3093#define _mm_maskz_fmul_round_sch(U, A, B, R) \
3094 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3095 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3096 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3097
3098static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3099 __m512h __B) {
3100 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3101 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3103}
3104
3105static __inline__ __m512h __DEFAULT_FN_ATTRS512
3106_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3107 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3108 (__v16sf)__W, (__mmask16)__U,
3110}
3111
3112static __inline__ __m512h __DEFAULT_FN_ATTRS512
3113_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3114 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3115 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3117}
3118
3119#define _mm512_fcmul_round_pch(A, B, R) \
3120 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3121 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3122 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3123
3124#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3125 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3126 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3127 (__mmask16)(U), (int)(R)))
3128
3129#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3130 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3131 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3132 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3133
3134static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3135 __m512h __B) {
3136 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3137 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3139}
3140
3141static __inline__ __m512h __DEFAULT_FN_ATTRS512
3142_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3143 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3144 (__v16sf)__W, (__mmask16)__U,
3146}
3147
3148static __inline__ __m512h __DEFAULT_FN_ATTRS512
3149_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3150 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3151 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3153}
3154
3155#define _mm512_fmul_round_pch(A, B, R) \
3156 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3157 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3158 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3159
3160#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3161 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3162 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3163 (__mmask16)(U), (int)(R)))
3164
3165#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3166 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3167 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3168 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3169
3170static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3171 __m512h __B,
3172 __m512h __C) {
3173 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3174 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3176}
3177
3178static __inline__ __m512h __DEFAULT_FN_ATTRS512
3179_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3180 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3181 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3183}
3184
3185static __inline__ __m512h __DEFAULT_FN_ATTRS512
3186_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3187 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3188 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3190}
3191
3192static __inline__ __m512h __DEFAULT_FN_ATTRS512
3193_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3194 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3195 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3197}
3198
3199#define _mm512_fcmadd_round_pch(A, B, C, R) \
3200 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3201 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3202 (__mmask16)-1, (int)(R)))
3203
3204#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3205 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3206 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3207 (__mmask16)(U), (int)(R)))
3208
3209#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3210 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3211 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3212 (__mmask16)(U), (int)(R)))
3213
3214#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3215 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3216 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3217 (__mmask16)(U), (int)(R)))
3218
3219static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3220 __m512h __B,
3221 __m512h __C) {
3222 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3223 (__v16sf)__C, (__mmask16)-1,
3225}
3226
3227static __inline__ __m512h __DEFAULT_FN_ATTRS512
3228_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3229 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3230 (__v16sf)__C, (__mmask16)__U,
3232}
3233
3234static __inline__ __m512h __DEFAULT_FN_ATTRS512
3235_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3236 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3237 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3239}
3240
3241static __inline__ __m512h __DEFAULT_FN_ATTRS512
3242_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3243 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3244 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3246}
3247
3248#define _mm512_fmadd_round_pch(A, B, C, R) \
3249 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3250 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3251 (__mmask16)-1, (int)(R)))
3252
3253#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3254 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3255 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3256 (__mmask16)(U), (int)(R)))
3257
3258#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3259 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3260 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3261 (__mmask16)(U), (int)(R)))
3262
3263#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3264 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3265 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3266 (__mmask16)(U), (int)(R)))
3267
3268static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3269_mm512_reduce_add_ph(__m512h __W) {
3270 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3271}
3272
3273static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3274_mm512_reduce_mul_ph(__m512h __W) {
3275 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3276}
3277
3278static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3279_mm512_reduce_max_ph(__m512h __V) {
3280 return __builtin_ia32_reduce_fmax_ph512(__V);
3281}
3282
3283static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3284_mm512_reduce_min_ph(__m512h __V) {
3285 return __builtin_ia32_reduce_fmin_ph512(__V);
3286}
3287
3288static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3289_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3290 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3291 (__v32hf)__A);
3292}
3293
3294static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3295_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3296 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3297 (__v32hi)__B);
3298}
3299
3300static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3301_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3302 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3303}
3304
// intrinsics below are alias for f*mul_*ch
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3340
3341#undef __DEFAULT_FN_ATTRS128
3342#undef __DEFAULT_FN_ATTRS256
3343#undef __DEFAULT_FN_ATTRS512
3344#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
3345#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
3346#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
3347
3348#endif
3349#endif
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
__device__ _Float16
static __inline__ vector float vector float __b
Definition altivec.h:578
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57
return __v
Definition arm_acle.h:88
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR
Definition avx2intrin.h:30
#define __DEFAULT_FN_ATTRS256_CONSTEXPR
Definition avx2intrin.h:29
#define __DEFAULT_FN_ATTRS512_CONSTEXPR
#define __DEFAULT_FN_ATTRS512
unsigned int __mmask32
#define _mm512_setzero_epi32
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_and_epi32(__m512i __a, __m512i __b)
static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_pd(void)
unsigned char __mmask8
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_xor_epi32(__m512i __a, __m512i __b)
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_ps(float __w)
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void)
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_epi32(int __s)
unsigned short __mmask16
#define _MM_FROUND_CUR_DIRECTION
#define _CMP_GT_OS
Definition avxintrin.h:1572
#define _CMP_GE_OS
Definition avxintrin.h:1571
#define _CMP_GT_OQ
Definition avxintrin.h:1588
#define _CMP_LE_OQ
Definition avxintrin.h:1576
#define _CMP_LT_OQ
Definition avxintrin.h:1575
#define _CMP_NEQ_US
Definition avxintrin.h:1578
#define _CMP_EQ_OS
Definition avxintrin.h:1574
#define _CMP_GE_OQ
Definition avxintrin.h:1587
static __inline__ void int __a
Definition emmintrin.h:4077
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition emmintrin.h:1765
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition emmintrin.h:1867
__inline unsigned int unsigned int unsigned int * __P
Definition bmi2intrin.h:25
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition xmmintrin.h:1892
#define _CMP_LE_OS
Definition xmmintrin.h:3017
#define _CMP_NEQ_UQ
Definition xmmintrin.h:3019
#define _CMP_LT_OS
Definition xmmintrin.h:3016
#define _CMP_EQ_OQ
Definition xmmintrin.h:3015
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2014