/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H
/* Define the 512-bit vector types used by the intrinsics in this file. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(128)))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif
43
static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9, __h8, __h7, __h6, __h5,
                            __h4, __h3, __h2, __h1};
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setr_ph(
    _Float16 e0, _Float16 e1, _Float16 e2, _Float16 e3, _Float16 e4,
    _Float16 e5, _Float16 e6, _Float16 e7, _Float16 e8, _Float16 e9,
    _Float16 e10, _Float16 e11, _Float16 e12, _Float16 e13, _Float16 e14,
    _Float16 e15, _Float16 e16, _Float16 e17, _Float16 e18, _Float16 e19,
    _Float16 e20, _Float16 e21, _Float16 e22, _Float16 e23, _Float16 e24,
    _Float16 e25, _Float16 e26, _Float16 e27, _Float16 e28, _Float16 e29,
    _Float16 e30, _Float16 e31) {
  return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
                       e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
                       e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

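/* Usage sketch (illustrative only, not part of the header): _mm512_set_ph
 * takes its 32 arguments from the highest-indexed element down to element 0,
 * while _mm512_setr_ph takes them in memory order, which is why _mm512_setr_ph
 * above simply forwards its arguments to _mm512_set_ph in reverse. So for any
 * scalar h:
 *
 *   __m512h a = _mm512_set1_ph(h);    // h broadcast to all 32 lanes
 *   _Float16 lo = _mm512_cvtsh_h(a);  // reads element 0, which equals h
 */
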
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex __h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                              15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                                 27, 28, 29, 30, 31);
}

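/* Note (illustrative, not part of the header): the casts above only
 * reinterpret bits and emit no instructions. When widening (for example
 * _mm256_castph128_ph256), the upper lanes are left unspecified via
 * __builtin_nondeterministic_value; use the zext variants below when the
 * upper lanes must be zero:
 *
 *   __m128h lo = _mm_setzero_ph();
 *   __m256h u = _mm256_castph128_ph256(lo);  // upper 128 bits undefined
 *   __m256h z = _mm256_zextph128_ph256(lo);  // upper 128 bits zeroed
 */
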
/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

#define _mm_comi_round_sh(A, B, P, R) \
  __builtin_ia32_vcomish((__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(P), \
                         (int)(R))

#define _mm_comi_sh(A, B, pred) \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
                                                            __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}

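/* Note (illustrative): the _mm_comi*_sh forms use ordered signaling
 * predicates (_CMP_*_OS / _CMP_NEQ_US), while the _mm_ucomi*_sh forms use the
 * quiet predicates (_CMP_*_OQ / _CMP_NEQ_UQ); the two differ only in whether
 * a QNaN operand raises an invalid-operation exception. Both return 0 or 1:
 *
 *   __m128h x = _mm_setzero_ph();
 *   int eq = _mm_comieq_sh(x, x);    // 1; would signal on a QNaN input
 *   int ueq = _mm_ucomieq_sh(x, x);  // 1; stays quiet on a QNaN input
 */
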
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

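/* Usage sketch (illustrative only): masked operations select per lane between
 * the computed result and a fallback; merge masking falls back to __W, zero
 * masking falls back to 0:
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.0);
 *   __m512h b = _mm512_set1_ph((_Float16)2.0);
 *   __mmask32 m = 0x0000FFFF;                    // low 16 lanes selected
 *   __m512h r0 = _mm512_maskz_add_ph(m, a, b);   // 3.0 in lanes 0..15, else 0
 *   __m512h r1 = _mm512_mask_add_ph(a, m, a, b); // 3.0 in lanes 0..15, else 1.0
 */
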
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_epi32((__m512i)__A,
                                   _mm512_set1_epi32(-2147483648));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}

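/* Note (illustrative): _mm512_abs_ph clears the sign bit of each half via the
 * 0x7FFF mask, replicated as 0x7FFF7FFF per 32-bit lane. _mm512_conj_pch
 * treats each 32-bit lane as a complex value packed as two halves, real in
 * the low half and imaginary in the high half, so XOR with 0x80000000 (that
 * is, _mm512_set1_epi32(-2147483648)) flips only the imaginary sign:
 *
 *   __m512h v = _mm512_set1_ph((_Float16)-1.5);
 *   __m512h av = _mm512_abs_ph(v);  // every lane becomes +1.5
 */
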
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

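/* Usage sketch (illustrative only): the *_sh scalar operations act on element
 * 0 and pass the remaining seven halves of __A through unchanged. GNU vector
 * types support subscripting, so:
 *
 *   __m128h a = _mm_setzero_ph();
 *   __m128h b = _mm_setzero_ph();
 *   a[0] = (_Float16)2.0;
 *   b[0] = (_Float16)3.0;
 *   __m128h r = _mm_add_sh(a, b);  // r[0] == 5.0, r[1..7] copied from a
 */
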
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
      _MM_FROUND_CUR_DIRECTION))

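/* Usage sketch (illustrative only): _mm512_cmp_ph_mask yields an __mmask32
 * that can drive any masked operation in this file, e.g. clamping negative
 * lanes to zero:
 *
 *   __m512h v = _mm512_set1_ph((_Float16)-2.0);
 *   __mmask32 neg = _mm512_cmp_ph_mask(v, _mm512_setzero_ph(), _CMP_LT_OQ);
 *   __m512h clamped = _mm512_maskz_add_ph(~neg, v, _mm512_setzero_ph());
 *   // lanes where v < 0 become 0; all other lanes keep v + 0 == v
 */
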
// loads with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

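/* Note (illustrative): _mm512_load_ph dereferences a __m512h pointer and so
 * requires 64-byte alignment, while _mm512_loadu_ph goes through a packed,
 * may_alias struct and accepts any alignment:
 *
 *   _Float16 buf[33];
 *   __m512h u = _mm512_loadu_ph(buf + 1);    // unaligned access: OK
 *   // __m512h a = _mm512_load_ph(buf + 1);  // UB unless 64-byte aligned
 */
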
// stores with vmovsh:
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

// moves with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
                                                            __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

// vmovw:
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}

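/* Usage sketch (illustrative only): _mm_cvtsi16_si128 and _mm_cvtsi128_si16
 * move a 16-bit value to/from element 0 of a vector (vmovw), which pairs with
 * the bit-pattern casts above for raw half moves:
 *
 *   short bits = 0x3C00;  // 1.0 in IEEE binary16
 *   __m128h h = _mm_castsi128_ph(_mm_cvtsi16_si128(bits));
 *   short back = _mm_cvtsi128_si16(_mm_castph_si128(h));  // == 0x3C00
 */
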
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

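/* Note (illustrative): the getmant immediate packs the sign-control field (C)
 * in bits 3:2 above the normalization-interval field (B) in bits 1:0, hence
 * the ((C) << 2) | (B) expression. The enum values from <avx512fintrin.h>
 * slot in directly:
 *
 *   __m512h m = _mm512_getmant_ph(v, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 *   // mantissas normalized to [1, 2), signs taken from the source
 */
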
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_roundscale_ph(A, B) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
                                           (__v32hf)(__m512h)(A), \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
                                           (__v32hf)_mm512_setzero_ph(), \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                           (__v32hf)_mm512_undefined_ph(), \
                                           (__mmask32)-1, (int)(R)))

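/* Note (illustrative): the roundscale immediate encodes the target precision
 * in its upper nibble (round to a multiple of 2^-M for M = imm[7:4]) and the
 * rounding mode in its low bits, so an immediate of 0 rounds to the nearest
 * integer:
 *
 *   __m512h i = _mm512_roundscale_ph(v, 0);             // round to integral
 *   __m512h q = _mm512_roundscale_ph(v, (4 << 4) | 0);  // nearest 1/16
 */
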
#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}

#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
                                                                 __mmask32 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)(U)))

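/* Note (illustrative): the fpclass immediate is a bitmask of categories to
 * test: 0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +Inf, 0x10 -Inf, 0x20 denormal,
 * 0x40 finite negative, 0x80 SNaN. A lane matching any selected category sets
 * its mask bit:
 *
 *   __mmask32 nan_or_inf =
 *       _mm512_fpclass_ph_mask(v, 0x01 | 0x80 | 0x08 | 0x10);
 */
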
1478#define _mm512_cvt_roundpd_ph(A, R) \
1479 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1480 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1481
1482#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1483 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1484 (__mmask8)(U), (int)(R)))
1485
1486#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1487 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1488 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1489
1490static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1491 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1492 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1494}
1495
1496static __inline__ __m128h __DEFAULT_FN_ATTRS512
1497_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1498 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1499 (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1500}
1501
1502static __inline__ __m128h __DEFAULT_FN_ATTRS512
1503_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1504 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1505 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1507}
1508
1509#define _mm512_cvt_roundph_pd(A, R) \
1510 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1511 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1512
1513#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1514 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1515 (__mmask8)(U), (int)(R)))
1516
1517#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1518 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1519 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1520
1521static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1522 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1523 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1524 _MM_FROUND_CUR_DIRECTION);
1525}
1526
1527static __inline__ __m512d __DEFAULT_FN_ATTRS512
1528_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1529 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1530 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1531}
1532
1533static __inline__ __m512d __DEFAULT_FN_ATTRS512
1534_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1535 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1536 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1537 _MM_FROUND_CUR_DIRECTION);
1538}
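/* Editorial note: pd<->ph conversions change element width by a factor of
   four, so the eight doubles of a __m512d correspond to the eight low
   _Float16 lanes of a __m128h. A round-trip sketch, assuming an initialized
   __m512d value d (illustrative only):

     __m128h h = _mm512_cvtpd_ph(d); // 8 x double -> 8 x _Float16
     __m512d e = _mm512_cvtph_pd(h); // widen back; exact, since every
                                     // _Float16 value is representable
                                     // as a double
*/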
1539
1540#define _mm_cvt_roundsh_ss(A, B, R) \
1541 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1542 (__v4sf)_mm_undefined_ps(), \
1543 (__mmask8)(-1), (int)(R)))
1544
1545#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1546 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1547 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1548
1549#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1550 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1551 (__v4sf)_mm_setzero_ps(), \
1552 (__mmask8)(U), (int)(R)))
1553
1554static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1555 __m128h __B) {
1556 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1557 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1558 _MM_FROUND_CUR_DIRECTION);
1559}
1560
1561static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1562 __mmask8 __U,
1563 __m128 __A,
1564 __m128h __B) {
1565 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1566 (__v4sf)__W, (__mmask8)__U,
1567 _MM_FROUND_CUR_DIRECTION);
1568}
1569
1570static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1571 __m128 __A,
1572 __m128h __B) {
1573 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1574 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1575 _MM_FROUND_CUR_DIRECTION);
1576}
1577
1578#define _mm_cvt_roundss_sh(A, B, R) \
1579 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1580 (__v8hf)_mm_undefined_ph(), \
1581 (__mmask8)(-1), (int)(R)))
1582
1583#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1584 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1585 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1586
1587#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1588 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1589 (__v8hf)_mm_setzero_ph(), \
1590 (__mmask8)(U), (int)(R)))
1591
1592static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1593 __m128 __B) {
1594 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1595 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1596 _MM_FROUND_CUR_DIRECTION);
1597}
1598
1599static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1600 __mmask8 __U,
1601 __m128h __A,
1602 __m128 __B) {
1603 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1604 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1605 _MM_FROUND_CUR_DIRECTION);
1606}
1607
1608static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1609 __m128h __A,
1610 __m128 __B) {
1611 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1612 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1613 _MM_FROUND_CUR_DIRECTION);
1614}
1615
1616#define _mm_cvt_roundsd_sh(A, B, R) \
1617 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1618 (__v8hf)_mm_undefined_ph(), \
1619 (__mmask8)(-1), (int)(R)))
1620
1621#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1622 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1623 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1624
1625#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1626 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1627 (__v8hf)_mm_setzero_ph(), \
1628 (__mmask8)(U), (int)(R)))
1629
1630static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1631 __m128d __B) {
1632 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1633 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1634 _MM_FROUND_CUR_DIRECTION);
1635}
1636
1637static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1638 __mmask8 __U,
1639 __m128h __A,
1640 __m128d __B) {
1641 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1642 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1643 _MM_FROUND_CUR_DIRECTION);
1644}
1645
1646static __inline__ __m128h __DEFAULT_FN_ATTRS128
1647_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1648 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1649 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1650 _MM_FROUND_CUR_DIRECTION);
1651}
1652
1653#define _mm_cvt_roundsh_sd(A, B, R) \
1654 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1655 (__v2df)_mm_undefined_pd(), \
1656 (__mmask8)(-1), (int)(R)))
1657
1658#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1659 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1660 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1661
1662#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1663 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1664 (__v2df)_mm_setzero_pd(), \
1665 (__mmask8)(U), (int)(R)))
1666
1667static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1668 __m128h __B) {
1669 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1670 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1671 _MM_FROUND_CUR_DIRECTION);
1672}
1673
1674static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1675 __mmask8 __U,
1676 __m128d __A,
1677 __m128h __B) {
1678 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1679 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1680 _MM_FROUND_CUR_DIRECTION);
1681}
1682
1683static __inline__ __m128d __DEFAULT_FN_ATTRS128
1684_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1685 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1686 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1687 _MM_FROUND_CUR_DIRECTION);
1688}
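/* Editorial note: like the other scalar intrinsics, these conversions write
   only element 0 and pass the remaining elements of the first operand
   through. Assuming initialized values a (__m128) and b (__m128h):

     __m128 f = _mm_cvtsh_ss(a, b); // f[0] = (float)b[0], f[1..3] = a[1..3]
*/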
1689
1690#define _mm512_cvt_roundph_epi16(A, R) \
1691 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1692 (__v32hi)_mm512_undefined_epi32(), \
1693 (__mmask32)(-1), (int)(R)))
1694
1695#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1696 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1697 (__mmask32)(U), (int)(R)))
1698
1699#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1700 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1701 (__v32hi)_mm512_setzero_epi32(), \
1702 (__mmask32)(U), (int)(R)))
1703
1704static __inline__ __m512i __DEFAULT_FN_ATTRS512
1705_mm512_cvtph_epi16(__m512h __A) {
1706 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1707 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1708 _MM_FROUND_CUR_DIRECTION);
1709}
1710
1711static __inline__ __m512i __DEFAULT_FN_ATTRS512
1712_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1713 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1714 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1715}
1716
1717static __inline__ __m512i __DEFAULT_FN_ATTRS512
1718_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1719 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1720 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1721 _MM_FROUND_CUR_DIRECTION);
1722}
1723
1724#define _mm512_cvtt_roundph_epi16(A, R) \
1725 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1726 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1727 (int)(R)))
1728
1729#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1730 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1731 (__mmask32)(U), (int)(R)))
1732
1733#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1734 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1735 (__v32hi)_mm512_setzero_epi32(), \
1736 (__mmask32)(U), (int)(R)))
1737
1738static __inline__ __m512i __DEFAULT_FN_ATTRS512
1739_mm512_cvttph_epi16(__m512h __A) {
1740 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1741 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1742 _MM_FROUND_CUR_DIRECTION);
1743}
1744
1745static __inline__ __m512i __DEFAULT_FN_ATTRS512
1746_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1747 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1748 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1749}
1750
1751static __inline__ __m512i __DEFAULT_FN_ATTRS512
1752_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1753 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1754 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1755 _MM_FROUND_CUR_DIRECTION);
1756}
1757
1758#define _mm512_cvt_roundepi16_ph(A, R) \
1759 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1760 (__v32hf)_mm512_undefined_ph(), \
1761 (__mmask32)(-1), (int)(R)))
1762
1763#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1764 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1765 (__mmask32)(U), (int)(R)))
1766
1767#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1768 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1769 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1770
1771static __inline__ __m512h __DEFAULT_FN_ATTRS512
1772_mm512_cvtepi16_ph(__m512i __A) {
1773 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1774 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1775 _MM_FROUND_CUR_DIRECTION);
1776}
1777
1778static __inline__ __m512h __DEFAULT_FN_ATTRS512
1779_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1780 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1781 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1782}
1783
1784static __inline__ __m512h __DEFAULT_FN_ATTRS512
1785_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1786 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1787 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1788 _MM_FROUND_CUR_DIRECTION);
1789}
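/* Editorial note: the cvt forms round according to the rounding operand (or
   the current MXCSR direction for _MM_FROUND_CUR_DIRECTION), while the cvtt
   forms always truncate toward zero. Illustrative sketch for an initialized
   __m512h value h:

     __m512i r = _mm512_cvtph_epi16(h);  // rounds per current direction
     __m512i t = _mm512_cvttph_epi16(h); // truncates toward zero
*/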
1790
1791#define _mm512_cvt_roundph_epu16(A, R) \
1792 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1793 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1794 (int)(R)))
1795
1796#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1797 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1798 (__mmask32)(U), (int)(R)))
1799
1800#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1801 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1802 (__v32hu)_mm512_setzero_epi32(), \
1803 (__mmask32)(U), (int)(R)))
1804
1805static __inline__ __m512i __DEFAULT_FN_ATTRS512
1806_mm512_cvtph_epu16(__m512h __A) {
1807 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1808 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1809 _MM_FROUND_CUR_DIRECTION);
1810}
1811
1812static __inline__ __m512i __DEFAULT_FN_ATTRS512
1813_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1814 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1815 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1816}
1817
1818static __inline__ __m512i __DEFAULT_FN_ATTRS512
1819_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1820 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1821 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1822 _MM_FROUND_CUR_DIRECTION);
1823}
1824
1825#define _mm512_cvtt_roundph_epu16(A, R) \
1826 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1827 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1828 (int)(R)))
1829
1830#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1831 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1832 (__mmask32)(U), (int)(R)))
1833
1834#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1835 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1836 (__v32hu)_mm512_setzero_epi32(), \
1837 (__mmask32)(U), (int)(R)))
1838
1839static __inline__ __m512i __DEFAULT_FN_ATTRS512
1840_mm512_cvttph_epu16(__m512h __A) {
1841 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1842 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1843 _MM_FROUND_CUR_DIRECTION);
1844}
1845
1846static __inline__ __m512i __DEFAULT_FN_ATTRS512
1847_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1848 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1849 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1850}
1851
1852static __inline__ __m512i __DEFAULT_FN_ATTRS512
1853_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1854 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1855 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1856 _MM_FROUND_CUR_DIRECTION);
1857}
1858
1859#define _mm512_cvt_roundepu16_ph(A, R) \
1860 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1861 (__v32hf)_mm512_undefined_ph(), \
1862 (__mmask32)(-1), (int)(R)))
1863
1864#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1865 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1866 (__mmask32)(U), (int)(R)))
1867
1868#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1869 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1870 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1871
1872static __inline__ __m512h __DEFAULT_FN_ATTRS512
1873_mm512_cvtepu16_ph(__m512i __A) {
1874 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1875 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1876 _MM_FROUND_CUR_DIRECTION);
1877}
1878
1879static __inline__ __m512h __DEFAULT_FN_ATTRS512
1880_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1881 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1882 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1883}
1884
1885static __inline__ __m512h __DEFAULT_FN_ATTRS512
1886_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1887 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1888 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1889 _MM_FROUND_CUR_DIRECTION);
1890}
1891
1892#define _mm512_cvt_roundph_epi32(A, R) \
1893 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1894 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1895 (int)(R)))
1896
1897#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1898 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1899 (__mmask16)(U), (int)(R)))
1900
1901#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1902 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1903 (__v16si)_mm512_setzero_epi32(), \
1904 (__mmask16)(U), (int)(R)))
1905
1906static __inline__ __m512i __DEFAULT_FN_ATTRS512
1907_mm512_cvtph_epi32(__m256h __A) {
1908 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1909 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1910 _MM_FROUND_CUR_DIRECTION);
1911}
1912
1913static __inline__ __m512i __DEFAULT_FN_ATTRS512
1914_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1915 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1916 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1917}
1918
1919static __inline__ __m512i __DEFAULT_FN_ATTRS512
1920_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1921 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1922 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1923 _MM_FROUND_CUR_DIRECTION);
1924}
1925
1926#define _mm512_cvt_roundph_epu32(A, R) \
1927 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1928 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1929 (int)(R)))
1930
1931#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1932 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1933 (__mmask16)(U), (int)(R)))
1934
1935#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1936 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1937 (__v16su)_mm512_setzero_epi32(), \
1938 (__mmask16)(U), (int)(R)))
1939
1940static __inline__ __m512i __DEFAULT_FN_ATTRS512
1941_mm512_cvtph_epu32(__m256h __A) {
1942 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1943 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1944 _MM_FROUND_CUR_DIRECTION);
1945}
1946
1947static __inline__ __m512i __DEFAULT_FN_ATTRS512
1948_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1949 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1950 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1951}
1952
1953static __inline__ __m512i __DEFAULT_FN_ATTRS512
1954_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1955 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1956 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1957 _MM_FROUND_CUR_DIRECTION);
1958}
1959
1960#define _mm512_cvt_roundepi32_ph(A, R) \
1961 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1962 (__v16hf)_mm256_undefined_ph(), \
1963 (__mmask16)(-1), (int)(R)))
1964
1965#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1966 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1967 (__mmask16)(U), (int)(R)))
1968
1969#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1970 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1971 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1972
1973static __inline__ __m256h __DEFAULT_FN_ATTRS512
1974_mm512_cvtepi32_ph(__m512i __A) {
1975 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1976 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1977 _MM_FROUND_CUR_DIRECTION);
1978}
1979
1980static __inline__ __m256h __DEFAULT_FN_ATTRS512
1981_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1982 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1983 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1984}
1985
1986static __inline__ __m256h __DEFAULT_FN_ATTRS512
1987_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1988 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1989 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1990 _MM_FROUND_CUR_DIRECTION);
1991}
1992
1993#define _mm512_cvt_roundepu32_ph(A, R) \
1994 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1995 (__v16hf)_mm256_undefined_ph(), \
1996 (__mmask16)(-1), (int)(R)))
1997
1998#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1999 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
2000 (__mmask16)(U), (int)(R)))
2001
2002#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
2003 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
2004 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2005
2006static __inline__ __m256h __DEFAULT_FN_ATTRS512
2007_mm512_cvtepu32_ph(__m512i __A) {
2008 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2009 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2010 _MM_FROUND_CUR_DIRECTION);
2011}
2012
2013static __inline__ __m256h __DEFAULT_FN_ATTRS512
2014_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
2015 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2016 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2017}
2018
2019static __inline__ __m256h __DEFAULT_FN_ATTRS512
2020_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2021 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2022 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2023 _MM_FROUND_CUR_DIRECTION);
2024}
2025
2026#define _mm512_cvtt_roundph_epi32(A, R) \
2027 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2028 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2029 (int)(R)))
2030
2031#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2032 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2033 (__mmask16)(U), (int)(R)))
2034
2035#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2036 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2037 (__v16si)_mm512_setzero_epi32(), \
2038 (__mmask16)(U), (int)(R)))
2039
2040static __inline__ __m512i __DEFAULT_FN_ATTRS512
2041_mm512_cvttph_epi32(__m256h __A) {
2042 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2043 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2044 _MM_FROUND_CUR_DIRECTION);
2045}
2046
2047static __inline__ __m512i __DEFAULT_FN_ATTRS512
2048_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2049 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2050 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2051}
2052
2053static __inline__ __m512i __DEFAULT_FN_ATTRS512
2054_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2055 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2056 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2057 _MM_FROUND_CUR_DIRECTION);
2058}
2059
2060#define _mm512_cvtt_roundph_epu32(A, R) \
2061 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2062 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2063 (int)(R)))
2064
2065#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2066 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2067 (__mmask16)(U), (int)(R)))
2068
2069#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2070 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2071 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2072 (int)(R)))
2073
2074static __inline__ __m512i __DEFAULT_FN_ATTRS512
2075_mm512_cvttph_epu32(__m256h __A) {
2076 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2077 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2078 _MM_FROUND_CUR_DIRECTION);
2079}
2080
2081static __inline__ __m512i __DEFAULT_FN_ATTRS512
2082_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2083 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2084 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2085}
2086
2087static __inline__ __m512i __DEFAULT_FN_ATTRS512
2088_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2089 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2090 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2091 _MM_FROUND_CUR_DIRECTION);
2092}
2093
2094#define _mm512_cvt_roundepi64_ph(A, R) \
2095 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2096 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2097
2098#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2099 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2100 (__mmask8)(U), (int)(R)))
2101
2102#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2103 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2104 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2105
2106static __inline__ __m128h __DEFAULT_FN_ATTRS512
2107_mm512_cvtepi64_ph(__m512i __A) {
2108 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2109 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2110 _MM_FROUND_CUR_DIRECTION);
2111}
2112
2113static __inline__ __m128h __DEFAULT_FN_ATTRS512
2114_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2115 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2116 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2117}
2118
2119static __inline__ __m128h __DEFAULT_FN_ATTRS512
2120_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2121 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2122 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2123 _MM_FROUND_CUR_DIRECTION);
2124}
2125
2126#define _mm512_cvt_roundph_epi64(A, R) \
2127 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2128 (__v8di)_mm512_undefined_epi32(), \
2129 (__mmask8)(-1), (int)(R)))
2130
2131#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2132 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2133 (__mmask8)(U), (int)(R)))
2134
2135#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2136 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2137 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2138
2139static __inline__ __m512i __DEFAULT_FN_ATTRS512
2140_mm512_cvtph_epi64(__m128h __A) {
2141 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2142 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2143 _MM_FROUND_CUR_DIRECTION);
2144}
2145
2146static __inline__ __m512i __DEFAULT_FN_ATTRS512
2147_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2148 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2149 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2150}
2151
2152static __inline__ __m512i __DEFAULT_FN_ATTRS512
2153_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2154 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2155 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2156 _MM_FROUND_CUR_DIRECTION);
2157}
2158
2159#define _mm512_cvt_roundepu64_ph(A, R) \
2160 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2161 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2162
2163#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2164 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2165 (__mmask8)(U), (int)(R)))
2166
2167#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2168 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2169 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2170
2171static __inline__ __m128h __DEFAULT_FN_ATTRS512
2172_mm512_cvtepu64_ph(__m512i __A) {
2173 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2174 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2175 _MM_FROUND_CUR_DIRECTION);
2176}
2177
2178static __inline__ __m128h __DEFAULT_FN_ATTRS512
2179_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2180 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2181 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2182}
2183
2184static __inline__ __m128h __DEFAULT_FN_ATTRS512
2185_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2186 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2187 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2188 _MM_FROUND_CUR_DIRECTION);
2189}
2190
2191#define _mm512_cvt_roundph_epu64(A, R) \
2192 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2193 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2194 (int)(R)))
2195
2196#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2197 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2198 (__mmask8)(U), (int)(R)))
2199
2200#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2201 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2202 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2203
2204static __inline__ __m512i __DEFAULT_FN_ATTRS512
2205_mm512_cvtph_epu64(__m128h __A) {
2206 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2207 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2208 _MM_FROUND_CUR_DIRECTION);
2209}
2210
2211static __inline__ __m512i __DEFAULT_FN_ATTRS512
2212_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2213 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2214 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2215}
2216
2217static __inline__ __m512i __DEFAULT_FN_ATTRS512
2218_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2219 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2220 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2221 _MM_FROUND_CUR_DIRECTION);
2222}
2223
2224#define _mm512_cvtt_roundph_epi64(A, R) \
2225 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2226 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2227 (int)(R)))
2228
2229#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2230 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2231 (__mmask8)(U), (int)(R)))
2232
2233#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2234 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2235 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2236
2237static __inline__ __m512i __DEFAULT_FN_ATTRS512
2238_mm512_cvttph_epi64(__m128h __A) {
2239 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2240 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2241 _MM_FROUND_CUR_DIRECTION);
2242}
2243
2244static __inline__ __m512i __DEFAULT_FN_ATTRS512
2245_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2246 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2247 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2248}
2249
2250static __inline__ __m512i __DEFAULT_FN_ATTRS512
2251_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2252 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2253 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2254 _MM_FROUND_CUR_DIRECTION);
2255}
2256
2257#define _mm512_cvtt_roundph_epu64(A, R) \
2258 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2259 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2260 (int)(R)))
2261
2262#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2263 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2264 (__mmask8)(U), (int)(R)))
2265
2266#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2267 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2268 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2269
2270static __inline__ __m512i __DEFAULT_FN_ATTRS512
2271_mm512_cvttph_epu64(__m128h __A) {
2272 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2273 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2274 _MM_FROUND_CUR_DIRECTION);
2275}
2276
2277static __inline__ __m512i __DEFAULT_FN_ATTRS512
2278_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2279 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2280 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2281}
2282
2283static __inline__ __m512i __DEFAULT_FN_ATTRS512
2284_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2285 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2286 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2287 _MM_FROUND_CUR_DIRECTION);
2288}
2289
2290#define _mm_cvt_roundsh_i32(A, R) \
2291 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2292
2293static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2294 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2295}
2296
2297#define _mm_cvt_roundsh_u32(A, R) \
2298 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2299
2300static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2301_mm_cvtsh_u32(__m128h __A) {
2302 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2303 _MM_FROUND_CUR_DIRECTION);
2304}
2305
2306#ifdef __x86_64__
2307#define _mm_cvt_roundsh_i64(A, R) \
2308 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2309
2310static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2311 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2312 _MM_FROUND_CUR_DIRECTION);
2313}
2314
2315#define _mm_cvt_roundsh_u64(A, R) \
2316 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2317
2318static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2319_mm_cvtsh_u64(__m128h __A) {
2320 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2321 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2322}
2323#endif // __x86_64__
2324
2325#define _mm_cvt_roundu32_sh(A, B, R) \
2326 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2327
2328static __inline__ __m128h __DEFAULT_FN_ATTRS128
2329_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2330 __A[0] = __B;
2331 return __A;
2332}
2333
2334#ifdef __x86_64__
2335#define _mm_cvt_roundu64_sh(A, B, R) \
2336 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2337 (int)(R)))
2338
2339static __inline__ __m128h __DEFAULT_FN_ATTRS128
2340_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2341 __A[0] = __B;
2342 return __A;
2343}
2344#endif
2345
2346#define _mm_cvt_roundi32_sh(A, B, R) \
2347 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2348
2349static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2350 int __B) {
2351 __A[0] = __B;
2352 return __A;
2353}
2354
2355#ifdef __x86_64__
2356#define _mm_cvt_roundi64_sh(A, B, R) \
2357 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2358
2359static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2360 long long __B) {
2361 __A[0] = __B;
2362 return __A;
2363}
2364#endif
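/* Editorial note: _mm_cvti32_sh and the related int->sh helpers are written
   as a plain element store (__A[0] = __B); the implicit integer-to-_Float16
   conversion plus insertion lets the compiler select vcvtsi2sh/vcvtusi2sh
   directly, so no builtin call is needed. Explicit rounding control is
   available only through the corresponding _mm_cvt_round*_sh macros. */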
2365
2366#define _mm_cvtt_roundsh_i32(A, R) \
2367 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2368
2369static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2370 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2371 _MM_FROUND_CUR_DIRECTION);
2372}
2373
2374#ifdef __x86_64__
2375#define _mm_cvtt_roundsh_i64(A, R) \
2376 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2377
2378static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2379 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2380 _MM_FROUND_CUR_DIRECTION);
2381}
2382#endif
2383
2384#define _mm_cvtt_roundsh_u32(A, R) \
2385 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2386
2387static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2388_mm_cvttsh_u32(__m128h __A) {
2389 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2390 _MM_FROUND_CUR_DIRECTION);
2391}
2392
2393#ifdef __x86_64__
2394#define _mm_cvtt_roundsh_u64(A, R) \
2395 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2396
2397static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2398_mm_cvttsh_u64(__m128h __A) {
2399 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2400 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2401}
2402#endif
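/* Usage sketch (editorial, variables are illustrative): scalar half->integer
   conversions of element 0 of an initialized __m128h value a:

     int i = _mm_cvtsh_i32(a);          // rounds per MXCSR
     int t = _mm_cvttsh_i32(a);         // truncates toward zero
     unsigned int u = _mm_cvtsh_u32(a); // unsigned, rounds per MXCSR
*/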
2403
2404#define _mm512_cvtx_roundph_ps(A, R) \
2405 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2406 (__v16sf)_mm512_undefined_ps(), \
2407 (__mmask16)(-1), (int)(R)))
2408
2409#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2410 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2411 (__mmask16)(U), (int)(R)))
2412
2413#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2414 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2415 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2416
2417static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2418 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2419 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2420 _MM_FROUND_CUR_DIRECTION);
2421}
2422
2423static __inline__ __m512 __DEFAULT_FN_ATTRS512
2424_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2425 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2426 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2427}
2428
2429static __inline__ __m512 __DEFAULT_FN_ATTRS512
2430_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2431 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2432 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2433 _MM_FROUND_CUR_DIRECTION);
2434}
2435
2436#define _mm512_cvtx_roundps_ph(A, R) \
2437 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2438 (__v16hf)_mm256_undefined_ph(), \
2439 (__mmask16)(-1), (int)(R)))
2440
2441#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2442 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2443 (__mmask16)(U), (int)(R)))
2444
2445#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2446 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2447 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2448
2449static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2450 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2451 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2452 _MM_FROUND_CUR_DIRECTION);
2453}
2454
2455static __inline__ __m256h __DEFAULT_FN_ATTRS512
2456_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2457 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2458 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2459}
2460
2461static __inline__ __m256h __DEFAULT_FN_ATTRS512
2462_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2463 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2464 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2465 _MM_FROUND_CUR_DIRECTION);
2466}
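/* Editorial note: the "cvtx" spelling maps to the AVX512-FP16
   VCVTPH2PSX/VCVTPS2PHX forms, which take and return _Float16 vector types
   (__m256h); the older _mm512_cvtph_ps from F16C/AVX512F instead accepts
   the same bits as a __m256i. Sketch for an initialized __m256h value h:

     __m512 f = _mm512_cvtxph_ps(h);  // 16 x _Float16 -> 16 x float
     __m256h g = _mm512_cvtxps_ph(f); // and narrow back
*/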
2467
2468#define _mm512_fmadd_round_ph(A, B, C, R) \
2469 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2470 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2471 (__mmask32)-1, (int)(R)))
2472
2473#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2474 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2475 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2476 (__mmask32)(U), (int)(R)))
2477
2478#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2479 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2480 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2481 (__mmask32)(U), (int)(R)))
2482
2483#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2484 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2485 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2486 (__mmask32)(U), (int)(R)))
2487
2488#define _mm512_fmsub_round_ph(A, B, C, R) \
2489 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2490 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2491 (__mmask32)-1, (int)(R)))
2492
2493#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2494 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2495 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2496 (__mmask32)(U), (int)(R)))
2497
2498#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2499 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2500 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2501 (__mmask32)(U), (int)(R)))
2502
2503#define _mm512_fnmadd_round_ph(A, B, C, R) \
2504 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2505 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2506 (__mmask32)-1, (int)(R)))
2507
2508#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2509 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2510 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2511 (__mmask32)(U), (int)(R)))
2512
2513#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2514 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2515 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2516 (__mmask32)(U), (int)(R)))
2517
2518#define _mm512_fnmsub_round_ph(A, B, C, R) \
2519 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2520 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2521 (__mmask32)-1, (int)(R)))
2522
2523#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2524 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2525 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2526 (__mmask32)(U), (int)(R)))
2527
2528static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2529 __m512h __B,
2530 __m512h __C) {
2531 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2532 (__v32hf)__C, (__mmask32)-1,
2533 _MM_FROUND_CUR_DIRECTION);
2534}
2535
2536static __inline__ __m512h __DEFAULT_FN_ATTRS512
2537_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2538 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2539 (__v32hf)__C, (__mmask32)__U,
2540 _MM_FROUND_CUR_DIRECTION);
2541}
2542
2543static __inline__ __m512h __DEFAULT_FN_ATTRS512
2544_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2545 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2546 (__v32hf)__C, (__mmask32)__U,
2547 _MM_FROUND_CUR_DIRECTION);
2548}
2549
2550static __inline__ __m512h __DEFAULT_FN_ATTRS512
2551_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2552 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2553 (__v32hf)__C, (__mmask32)__U,
2554 _MM_FROUND_CUR_DIRECTION);
2555}
2556
2557static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2558 __m512h __B,
2559 __m512h __C) {
2560 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2561 -(__v32hf)__C, (__mmask32)-1,
2562 _MM_FROUND_CUR_DIRECTION);
2563}
2564
2565static __inline__ __m512h __DEFAULT_FN_ATTRS512
2566_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2567 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2568 -(__v32hf)__C, (__mmask32)__U,
2569 _MM_FROUND_CUR_DIRECTION);
2570}
2571
2572static __inline__ __m512h __DEFAULT_FN_ATTRS512
2573_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2574 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2575 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2576 _MM_FROUND_CUR_DIRECTION);
2577}
2578
2579static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2580 __m512h __B,
2581 __m512h __C) {
2582 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2583 (__v32hf)__C, (__mmask32)-1,
2584 _MM_FROUND_CUR_DIRECTION);
2585}
2586
2587static __inline__ __m512h __DEFAULT_FN_ATTRS512
2588_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2589 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2590 (__v32hf)__C, (__mmask32)__U,
2591 _MM_FROUND_CUR_DIRECTION);
2592}
2593
2594static __inline__ __m512h __DEFAULT_FN_ATTRS512
2595_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2596 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2597 (__v32hf)__C, (__mmask32)__U,
2598 _MM_FROUND_CUR_DIRECTION);
2599}
2600
2601static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2602 __m512h __B,
2603 __m512h __C) {
2604 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2605 -(__v32hf)__C, (__mmask32)-1,
2606 _MM_FROUND_CUR_DIRECTION);
2607}
2608
2609static __inline__ __m512h __DEFAULT_FN_ATTRS512
2610_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2611 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2612 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2613 _MM_FROUND_CUR_DIRECTION);
2614}
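/* Editorial note: fmsub, fnmadd and fnmsub are all routed through the single
   vfmaddph512 builtin family by negating operands, since a*b-c, -(a*b)+c and
   -(a*b)-c are fmadd with sign-flipped inputs; the negations fold into the
   FMA instruction that is selected. E.g., modulo sign-of-zero/NaN details,

     __m512h r = _mm512_fmadd_ph(a, b, _mm512_sub_ph(_mm512_setzero_ph(), c));

   computes the same values as _mm512_fmsub_ph(a, b, c) for initialized
   __m512h values a, b, c (illustrative only). */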
2615
2616#define _mm512_fmaddsub_round_ph(A, B, C, R) \
2617 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2618 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2619 (__mmask32)-1, (int)(R)))
2620
2621#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2622 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2623 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2624 (__mmask32)(U), (int)(R)))
2625
2626#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2627 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2628 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2629 (__mmask32)(U), (int)(R)))
2630
2631#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2632 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2633 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2634 (__mmask32)(U), (int)(R)))
2635
2636#define _mm512_fmsubadd_round_ph(A, B, C, R) \
2637 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2638 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2639 (__mmask32)-1, (int)(R)))
2640
2641#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2642 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2643 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2644 (__mmask32)(U), (int)(R)))
2645
2646#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2647 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2648 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2649 (__mmask32)(U), (int)(R)))
2650
2651static __inline__ __m512h __DEFAULT_FN_ATTRS512
2652_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2653 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2654 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2655 _MM_FROUND_CUR_DIRECTION);
2656}
2657
2658static __inline__ __m512h __DEFAULT_FN_ATTRS512
2659_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2660 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2661 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2662 _MM_FROUND_CUR_DIRECTION);
2663}
2664
2665static __inline__ __m512h __DEFAULT_FN_ATTRS512
2666_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2667 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2668 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2669 _MM_FROUND_CUR_DIRECTION);
2670}
2671
2672static __inline__ __m512h __DEFAULT_FN_ATTRS512
2673_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2674 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2675 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2676 _MM_FROUND_CUR_DIRECTION);
2677}
2678
2679static __inline__ __m512h __DEFAULT_FN_ATTRS512
2680_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2681 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2682 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2683 _MM_FROUND_CUR_DIRECTION);
2684}
2685
2686static __inline__ __m512h __DEFAULT_FN_ATTRS512
2687_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2688 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2689 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2690 _MM_FROUND_CUR_DIRECTION);
2691}
2692
2693static __inline__ __m512h __DEFAULT_FN_ATTRS512
2694_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2695 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2696 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2697 _MM_FROUND_CUR_DIRECTION);
2698}
2699
2700#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2701 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2702 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2703 (__mmask32)(U), (int)(R)))
2704
2705static __inline__ __m512h __DEFAULT_FN_ATTRS512
2706_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2707 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2708 (__v32hf)__C, (__mmask32)__U,
2709 _MM_FROUND_CUR_DIRECTION);
2710}
2711
2712#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2713 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2714 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2715 (__mmask32)(U), (int)(R)))
2716
2717static __inline__ __m512h __DEFAULT_FN_ATTRS512
2718_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2719 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2720 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2721 _MM_FROUND_CUR_DIRECTION);
2722}
2723
2724#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2725 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2726 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2727 (__mmask32)(U), (int)(R)))
2728
2729static __inline__ __m512h __DEFAULT_FN_ATTRS512
2730_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2731 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2732 (__v32hf)__C, (__mmask32)__U,
2733 _MM_FROUND_CUR_DIRECTION);
2734}
2735
2736#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2737 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2738 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2739 (__mmask32)(U), (int)(R)))
2740
2741#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2742 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2743 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2744 (__mmask32)(U), (int)(R)))
2745
2746static __inline__ __m512h __DEFAULT_FN_ATTRS512
2747_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2748 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2749 -(__v32hf)__C, (__mmask32)__U,
2750 _MM_FROUND_CUR_DIRECTION);
2751}
2752
2753static __inline__ __m512h __DEFAULT_FN_ATTRS512
2754_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2755 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2756 (__v32hf)__C, (__mmask32)__U,
2757 _MM_FROUND_CUR_DIRECTION);
2758}
2759
2760static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2761 __m128h __A,
2762 __m128h __B) {
2763 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2764 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2765}
2766
2767static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2768 __mmask8 __U,
2769 __m128h __A,
2770 __m128h __B) {
2771 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2772 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2773}
2774
2775#define _mm_fmadd_round_sh(A, B, C, R) \
2776 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2777 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2778 (__mmask8)-1, (int)(R)))
2779
2780#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2781 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2782 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2783 (__mmask8)(U), (int)(R)))
2784
2785static __inline__ __m128h __DEFAULT_FN_ATTRS128
2786_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2787 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2788 (__mmask8)__U,
2789 _MM_FROUND_CUR_DIRECTION);
2790}
2791
2792#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2793 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2794 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2795 (__mmask8)(U), (int)(R)))
2796
2797static __inline__ __m128h __DEFAULT_FN_ATTRS128
2798_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2799 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2800 (__mmask8)__U,
2801 _MM_FROUND_CUR_DIRECTION);
2802}
2803
2804#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2805 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2806 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2807 (__mmask8)(U), (int)(R)))
2808
2809static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2810 __m128h __A,
2811 __m128h __B) {
2812 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2813 -(__v8hf)__B, (__mmask8)-1,
2814 _MM_FROUND_CUR_DIRECTION);
2815}
2816
2817static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2818 __mmask8 __U,
2819 __m128h __A,
2820 __m128h __B) {
2821 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2822 -(__v8hf)__B, (__mmask8)__U,
2823 _MM_FROUND_CUR_DIRECTION);
2824}
2825
2826#define _mm_fmsub_round_sh(A, B, C, R) \
2827 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2828 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2829 (__mmask8)-1, (int)(R)))
2830
2831#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2832 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2833 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2834 (__mmask8)(U), (int)(R)))
2835
2836static __inline__ __m128h __DEFAULT_FN_ATTRS128
2837_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2838 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2839 -(__v8hf)__C, (__mmask8)__U,
2840 _MM_FROUND_CUR_DIRECTION);
2841}
2842
2843#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2844 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2845 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2846 (__mmask8)(U), (int)(R)))
2847
2848static __inline__ __m128h __DEFAULT_FN_ATTRS128
2849_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2850 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2851 (__mmask8)__U,
2852 _MM_FROUND_CUR_DIRECTION);
2853}
2854
2855#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2856 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2857 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2858 (__mmask8)(U), (int)(R)))
2859
2860static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2861 __m128h __A,
2862 __m128h __B) {
2863 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2864 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2865}
2866
2867static __inline__ __m128h __DEFAULT_FN_ATTRS128
2868_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2869 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2870 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2871}
2872
2873#define _mm_fnmadd_round_sh(A, B, C, R) \
2874 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2875 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2876 (__mmask8)-1, (int)(R)))
2877
2878#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2879 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2880 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2881 (__mmask8)(U), (int)(R)))
2882
2883static __inline__ __m128h __DEFAULT_FN_ATTRS128
2884_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2885 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2886 (__mmask8)__U,
2887 _MM_FROUND_CUR_DIRECTION);
2888}
2889
2890#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2891 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2892 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2893 (__mmask8)(U), (int)(R)))
2894
2895static __inline__ __m128h __DEFAULT_FN_ATTRS128
2896_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2897 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2898 (__mmask8)__U,
2899 _MM_FROUND_CUR_DIRECTION);
2900}
2901
2902#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2903 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2904 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2905 (__mmask8)(U), (int)(R)))
2906
2907static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2908 __m128h __A,
2909 __m128h __B) {
2910 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2911 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2912}
2913
2914static __inline__ __m128h __DEFAULT_FN_ATTRS128
2915_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2916 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2917 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2918}
2919
2920#define _mm_fnmsub_round_sh(A, B, C, R) \
2921 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2922 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2923 (__mmask8)-1, (int)(R)))
2924
2925#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2926 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2927 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2928 (__mmask8)(U), (int)(R)))
2929
2930static __inline__ __m128h __DEFAULT_FN_ATTRS128
2931_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2932 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2933 (__mmask8)__U,
2934 _MM_FROUND_CUR_DIRECTION);
2935}
2936
2937#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2938 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2939 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2940 (__mmask8)(U), (int)(R)))
2941
2942static __inline__ __m128h __DEFAULT_FN_ATTRS128
2943_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2944 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2945 (__mmask8)__U,
2946 _MM_FROUND_CUR_DIRECTION);
2947}
2948
2949#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2950 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2951 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2952 (__mmask8)(U), (int)(R)))
2953
2954static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2955 __m128h __B,
2956 __m128h __C) {
2957 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2958 (__v4sf)__C, (__mmask8)-1,
2959 _MM_FROUND_CUR_DIRECTION);
2960}
2961
2962static __inline__ __m128h __DEFAULT_FN_ATTRS128
2963_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2964 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2965 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2966}
2967
2968static __inline__ __m128h __DEFAULT_FN_ATTRS128
2969_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2970 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2971 (__v4sf)__C, (__mmask8)__U,
2972 _MM_FROUND_CUR_DIRECTION);
2973}
2974
2975static __inline__ __m128h __DEFAULT_FN_ATTRS128
2976_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2977 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2978 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2979}
2980
2981#define _mm_fcmadd_round_sch(A, B, C, R) \
2982 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2983 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2984 (__mmask8)-1, (int)(R)))
2985
2986#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2987 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2988 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2989 (__mmask8)(U), (int)(R)))
2990
2991#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2992 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2993 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2994 (__mmask8)(U), (int)(R)))
2995
2996#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2997 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2998 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2999 (__mmask8)(U), (int)(R)))
3000
3001static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
3002 __m128h __B,
3003 __m128h __C) {
3004 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
3005 (__v4sf)__C, (__mmask8)-1,
3006 _MM_FROUND_CUR_DIRECTION);
3007}
3008
3009static __inline__ __m128h __DEFAULT_FN_ATTRS128
3010_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
3011 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
3012 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
3013}
3014
3015static __inline__ __m128h __DEFAULT_FN_ATTRS128
3016_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
3017 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3018 (__v4sf)__C, (__mmask8)__U,
3019 _MM_FROUND_CUR_DIRECTION);
3020}
3021
3022static __inline__ __m128h __DEFAULT_FN_ATTRS128
3023_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3024 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3025 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3026}
3027
3028#define _mm_fmadd_round_sch(A, B, C, R) \
3029 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3030 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3031 (__mmask8)-1, (int)(R)))
3032
3033#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3034 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3035 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3036 (__mmask8)(U), (int)(R)))
3037
3038#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3039 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3040 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3041 (__mmask8)(U), (int)(R)))
3042
3043#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3044 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3045 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3046 (__mmask8)(U), (int)(R)))
3047
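/* Complex scalar multiply: _mm_fcmul_sch computes a * conj(b) and the
 * unconjugated _mm_fmul_sch below computes a * b on the lowest complex
 * element. In the masked forms the result lane is taken from __W (or zeroed
 * in the maskz forms) when bit 0 of the mask is clear. */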
3048static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3049 __m128h __B) {
3050 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3051 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3052      _MM_FROUND_CUR_DIRECTION);
3053}
3054
3055static __inline__ __m128h __DEFAULT_FN_ATTRS128
3056_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3057 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3058 (__v4sf)__W, (__mmask8)__U,
3059                                                _MM_FROUND_CUR_DIRECTION);
3060}
3061
3062static __inline__ __m128h __DEFAULT_FN_ATTRS128
3063_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3064 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3065 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3066      _MM_FROUND_CUR_DIRECTION);
3067}
3068
3069#define _mm_fcmul_round_sch(A, B, R) \
3070 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3071 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3072 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3073
3074#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3075 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3076 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3077 (__mmask8)(U), (int)(R)))
3078
3079#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3080 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3081 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3082 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3083
3084static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3085 __m128h __B) {
3086 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3087 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3088      _MM_FROUND_CUR_DIRECTION);
3089}
3090
3091static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3092 __mmask8 __U,
3093 __m128h __A,
3094 __m128h __B) {
3095 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3096 (__v4sf)__W, (__mmask8)__U,
3097                                               _MM_FROUND_CUR_DIRECTION);
3098}
3099
3100static __inline__ __m128h __DEFAULT_FN_ATTRS128
3101_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3102 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3103 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3104      _MM_FROUND_CUR_DIRECTION);
3105}
3106
3107#define _mm_fmul_round_sch(A, B, R) \
3108 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3109 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3110 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3111
3112#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3113 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3114 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3115 (__mmask8)(U), (int)(R)))
3116
3117#define _mm_maskz_fmul_round_sch(U, A, B, R) \
3118 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3119 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3120 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3121
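/* Packed complex multiply over sixteen FP16 complex elements. Note the mask
 * type: a __m512h holds 32 half-precision values but only 16 complex
 * numbers, so these intrinsics take an __mmask16 with one bit per complex
 * element rather than an __mmask32. */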
3122static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3123 __m512h __B) {
3124 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3125 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3126      _MM_FROUND_CUR_DIRECTION);
3127}
3128
3129static __inline__ __m512h __DEFAULT_FN_ATTRS512
3130_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3131 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3132 (__v16sf)__W, (__mmask16)__U,
3133                                                   _MM_FROUND_CUR_DIRECTION);
3134}
3135
3136static __inline__ __m512h __DEFAULT_FN_ATTRS512
3137_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3138 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3139 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3140      _MM_FROUND_CUR_DIRECTION);
3141}
3142
3143#define _mm512_fcmul_round_pch(A, B, R) \
3144 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3145 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3146 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3147
3148#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3149 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3150 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3151 (__mmask16)(U), (int)(R)))
3152
3153#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3154 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3155 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3156 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3157
3158static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3159 __m512h __B) {
3160 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3161 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3162      _MM_FROUND_CUR_DIRECTION);
3163}
3164
3165static __inline__ __m512h __DEFAULT_FN_ATTRS512
3166_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3167 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3168 (__v16sf)__W, (__mmask16)__U,
3169                                                  _MM_FROUND_CUR_DIRECTION);
3170}
3171
3172static __inline__ __m512h __DEFAULT_FN_ATTRS512
3173_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3174 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3175 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3176      _MM_FROUND_CUR_DIRECTION);
3177}
3178
3179#define _mm512_fmul_round_pch(A, B, R) \
3180 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3181 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3182 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3183
3184#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3185 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3186 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3187 (__mmask16)(U), (int)(R)))
3188
3189#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3190 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3191 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3192 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3193
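/* Packed complex fused multiply-add: _mm512_fcmadd_pch computes
 * a * conj(b) + c and _mm512_fmadd_pch computes a * b + c for each complex
 * element. As an illustrative sketch (not part of the original header), an
 * elementwise complex product can be accumulated across loop iterations with
 *   acc = _mm512_fmadd_pch(x, y, acc);
 */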
3194static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3195 __m512h __B,
3196 __m512h __C) {
3197 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3198 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3199      _MM_FROUND_CUR_DIRECTION);
3200}
3201
3202static __inline__ __m512h __DEFAULT_FN_ATTRS512
3203_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3204 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3205 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3206      _MM_FROUND_CUR_DIRECTION);
3207}
3208
3209static __inline__ __m512h __DEFAULT_FN_ATTRS512
3210_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3211 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3212 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3213      _MM_FROUND_CUR_DIRECTION);
3214}
3215
3216static __inline__ __m512h __DEFAULT_FN_ATTRS512
3217_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3218 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3219 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3220      _MM_FROUND_CUR_DIRECTION);
3221}
3222
3223#define _mm512_fcmadd_round_pch(A, B, C, R) \
3224 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3225 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3226 (__mmask16)-1, (int)(R)))
3227
3228#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3229 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3230 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3231 (__mmask16)(U), (int)(R)))
3232
3233#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3234 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3235 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3236 (__mmask16)(U), (int)(R)))
3237
3238#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3239 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3240 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3241 (__mmask16)(U), (int)(R)))
3242
3243static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3244 __m512h __B,
3245 __m512h __C) {
3246 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3247 (__v16sf)__C, (__mmask16)-1,
3248                                                    _MM_FROUND_CUR_DIRECTION);
3249}
3250
3251static __inline__ __m512h __DEFAULT_FN_ATTRS512
3252_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3253 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3254 (__v16sf)__C, (__mmask16)__U,
3255                                                   _MM_FROUND_CUR_DIRECTION);
3256}
3257
3258static __inline__ __m512h __DEFAULT_FN_ATTRS512
3259_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3260 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3261 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3262      _MM_FROUND_CUR_DIRECTION);
3263}
3264
3265static __inline__ __m512h __DEFAULT_FN_ATTRS512
3266_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3267 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3268 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3269      _MM_FROUND_CUR_DIRECTION);
3270}
3271
3272#define _mm512_fmadd_round_pch(A, B, C, R) \
3273 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3274 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3275 (__mmask16)-1, (int)(R)))
3276
3277#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3278 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3279 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3280 (__mmask16)(U), (int)(R)))
3281
3282#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3283 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3284 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3285 (__mmask16)(U), (int)(R)))
3286
3287#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3288 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3289 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3290 (__mmask16)(U), (int)(R)))
3291
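/* Horizontal reductions over all 32 FP16 elements. The seed passed to each
 * builtin is the identity of the operation: -0.0f16 for addition (adding
 * -0.0 leaves every value, including +0.0 and -0.0, unchanged under
 * round-to-nearest) and 1.0f16 for multiplication. */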
3292static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3293_mm512_reduce_add_ph(__m512h __W) {
3294 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3295}
3296
3297static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3298_mm512_reduce_mul_ph(__m512h __W) {
3299 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3300}
3301
3302static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3303_mm512_reduce_max_ph(__m512h __V) {
3304 return __builtin_ia32_reduce_fmax_ph512(__V);
3305}
3306
3307static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3308_mm512_reduce_min_ph(__m512h __V) {
3309 return __builtin_ia32_reduce_fmin_ph512(__V);
3310}
3311
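/* _mm512_mask_blend_ph selects each element from __W where the corresponding
 * mask bit is set and from __A where it is clear. The permute intrinsics
 * below reinterpret the vector as 32 16-bit integer lanes and reuse the
 * VPERMI2W/VPERMW builtins, since relocating FP16 lanes is purely a bitwise
 * move. */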
3312static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3313_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3314 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3315 (__v32hf)__A);
3316}
3317
3318static __inline__ __m512h __DEFAULT_FN_ATTRS512
3319_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3320 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3321 (__v32hi)__B);
3322}
3323
3324static __inline__ __m512h __DEFAULT_FN_ATTRS512
3325_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3326 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3327}
3328
3329// The intrinsics below are aliases for the f*mul_*ch intrinsics.
3330#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
3331#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
3332#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
3333#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
3334#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
3335 _mm512_mask_fmul_round_pch(W, U, A, B, R)
3336#define _mm512_maskz_mul_round_pch(U, A, B, R) \
3337 _mm512_maskz_fmul_round_pch(U, A, B, R)
3338
3339#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
3340#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
3341#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
3342#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
3343#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
3344 _mm512_mask_fcmul_round_pch(W, U, A, B, R)
3345#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
3346 _mm512_maskz_fcmul_round_pch(U, A, B, R)
3347
3348#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
3349#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
3350#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
3351#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
3352#define _mm_mask_mul_round_sch(W, U, A, B, R) \
3353 _mm_mask_fmul_round_sch(W, U, A, B, R)
3354#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
3355
3356#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
3357#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
3358#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
3359#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
3360#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
3361 _mm_mask_fcmul_round_sch(W, U, A, B, R)
3362#define _mm_maskz_cmul_round_sch(U, A, B, R) \
3363 _mm_maskz_fcmul_round_sch(U, A, B, R)
3364
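/* An illustrative usage sketch (not part of the original header), assuming a
 * translation unit compiled with -mavx512fp16:
 *
 *   __m512h x = _mm512_set1_ph((_Float16)1.5);
 *   __m512h y = _mm512_set1_ph((_Float16)0.5);
 *   __m512h p = _mm512_mul_pch(x, y);      // alias of _mm512_fmul_pch
 *   _Float16 s = _mm512_reduce_add_ph(p);  // sum of all 32 FP16 elements
 */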
3365#undef __DEFAULT_FN_ATTRS128
3366#undef __DEFAULT_FN_ATTRS256
3367#undef __DEFAULT_FN_ATTRS512
3368#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
3369#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
3370#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
3371
3372#endif
3373#endif