/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

/* Define the 512-bit half-precision vector types used in this file. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(128)))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

static __inline__ _Float16 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setr_ph(
    _Float16 e0, _Float16 e1, _Float16 e2, _Float16 e3, _Float16 e4,
    _Float16 e5, _Float16 e6, _Float16 e7, _Float16 e8, _Float16 e9,
    _Float16 e10, _Float16 e11, _Float16 e12, _Float16 e13, _Float16 e14,
    _Float16 e15, _Float16 e16, _Float16 e17, _Float16 e18, _Float16 e19,
    _Float16 e20, _Float16 e21, _Float16 e22, _Float16 e23, _Float16 e24,
    _Float16 e25, _Float16 e26, _Float16 e27, _Float16 e28, _Float16 e29,
    _Float16 e30, _Float16 e31) {
  return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
                       e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
                       e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

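/* Editorial usage sketch, not part of the upstream header: broadcasting a
 * scalar with _mm512_set1_ph and reading element 0 back with _mm512_cvtsh_h.
 * __example_broadcast_roundtrip is a hypothetical helper; it returns 3.5. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
__example_broadcast_roundtrip(void) {
  __m512h __v = _mm512_set1_ph((_Float16)3.5); /* all 32 lanes hold 3.5 */
  return _mm512_cvtsh_h(__v);                  /* extract lane 0 */
}
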
static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set1_pch(_Float16 _Complex __h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                 11, 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                              14, 15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
      19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                 26, 27, 28, 29, 30, 31);
}

/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                                 28, 29, 30, 31);
}

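/* Editorial usage sketch, not part of the upstream header: the cast
 * intrinsics reinterpret bits and leave widened upper lanes undefined, while
 * the zext intrinsics guarantee zeroed upper lanes. __example_widen_ph is a
 * hypothetical helper. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__example_widen_ph(__m128h __lo) {
  /* Upper 384 bits are well-defined (zero) here; with
     _mm512_castph128_ph512 they would be undefined. */
  return _mm512_zextph128_ph512(__lo);
}
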
#define _mm_comi_round_sh(A, B, P, R) \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred) \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_comieq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_comilt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_comile_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_comigt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_comige_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_comineq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_ucomieq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_ucomilt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_ucomile_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_ucomigt_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_ucomige_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_ucomineq_sh(__m128h __A, __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}

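/* Editorial usage sketch, not part of the upstream header: the comi forms
 * use ordered-signaling predicates and the ucomi forms use quiet predicates,
 * so both return 0 for an equality test when either operand is NaN.
 * __example_scalars_equal is a hypothetical helper. */
static __inline__ int __DEFAULT_FN_ATTRS128
__example_scalars_equal(__m128h __a, __m128h __b) {
  return _mm_comieq_sh(__a, __b); /* 1 iff lane 0 of __a == lane 0 of __b */
}
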
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_add_ph(__m512h __A, __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

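/* Editorial usage sketch, not part of the upstream header: the mask/maskz
 * pattern used throughout this file. With a mask, unselected lanes come from
 * the passthrough vector; with maskz they are zeroed. __example_masked_sum
 * is a hypothetical helper. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__example_masked_sum(__m512h __w, __m512h __a, __m512h __b) {
  /* Add only the even lanes; odd lanes keep the values from __w. */
  return _mm512_mask_add_ph(__w, 0x55555555, __a, __b);
}
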
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_sub_ph(__m512h __A, __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mul_ph(__m512h __A, __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_div_ph(__m512h __A, __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_min_ph(__m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_max_ph(__m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_epi32((__m512i)__A,
                                   _mm512_set1_epi32(-2147483648));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}

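/* Editorial usage sketch, not part of the upstream header: for the _pch
 * operations each __m512h is viewed as 16 complex FP16 values (real part in
 * the even element, imaginary part in the odd element of each pair), and
 * conjugation just flips the imaginary sign bits. __example_conj_roundtrip
 * is a hypothetical helper. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__example_conj_roundtrip(__m512h __z) {
  /* conj(conj(z)) == z bit-for-bit, since only sign bits are XORed. */
  return _mm512_conj_pch(_mm512_conj_pch(__z));
}
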
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_add_sh(__m128h __A, __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_sub_sh(__m128h __A, __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mul_sh(__m128h __A, __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_div_sh(__m128h __A, __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_min_sh(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_min_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_min_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_max_sh(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_max_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_max_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
      _MM_FROUND_CUR_DIRECTION))
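
/* Editorial usage sketch, not part of the upstream header: a packed compare
 * produces one mask bit per lane, which can drive any mask/maskz intrinsic.
 * __example_clamp_negatives is a hypothetical helper. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__example_clamp_negatives(__m512h __a) {
  /* Keep lanes that are >= 0 (ordered, signaling); zero the rest via a
     masked add with a zero addend. */
  __mmask32 __nonneg = _mm512_cmp_ph_mask(__a, _mm512_setzero_ph(), _CMP_GE_OS);
  return _mm512_maskz_add_ph(__nonneg, __a, _mm512_setzero_ph());
}
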
// loads with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src,
                                                __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

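/* Editorial usage sketch, not part of the upstream header: _mm512_load_ph
 * requires 64-byte alignment, while _mm512_loadu_ph accepts any address.
 * __example_load_add is a hypothetical helper. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__example_load_add(const _Float16 *__p, const _Float16 *__q) {
  /* __p is assumed 64-byte aligned; __q may be arbitrarily aligned. */
  return _mm512_add_ph(_mm512_load_ph(__p), _mm512_loadu_ph(__q));
}
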
// stores with vmovsh:
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_store_sh(void *__dp, __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_store_sh(void *__W, __mmask8 __U, __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_store_ph(void *__P, __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_store_ph(void *__P, __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_store_ph(void *__P, __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_storeu_ph(void *__P, __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_storeu_ph(void *__P, __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_storeu_ph(void *__P, __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

// moves with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_move_sh(__m128h __a, __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

// vmovw:
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

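/* Editorial usage sketch, not part of the upstream header: rcp and rsqrt
 * are fast approximations rather than correctly rounded results, so they
 * trade accuracy for latency. A hypothetical approximate division built
 * from them: */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__example_fast_div_ph(__m512h __a, __m512h __b) {
  /* a * rcp(b): cheaper but less accurate than _mm512_div_ph(a, b). */
  return _mm512_mul_ph(__a, _mm512_rcp_ph(__b));
}
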
#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_scalef_ph(__m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(),
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_roundscale_ph(A, B) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
                                           (__v32hf)(__m512h)(A), \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
                                           (__v32hf)_mm512_setzero_ph(), \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                           (__v32hf)_mm512_undefined_ph(), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), \
                                            (int)(imm), (__v32hf)(__m512h)(W), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), \
                                            (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), \
                                            (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_rcp_sh(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rcp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rcp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_rsqrt_sh(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_getexp_sh(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_scalef_sh(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W,
                                                     (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}

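/* Editorial usage sketch, not part of the upstream header: the _round_
 * forms take an explicit static rounding mode, and _MM_FROUND_NO_EXC
 * suppresses exceptions. __example_sqrt_rz is a hypothetical helper. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
__example_sqrt_rz(__m512h __a) {
  /* Square root rounded toward zero, without raising exceptions. */
  return _mm512_sqrt_round_ph(__a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
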
#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_sqrt_sh(__m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_sh(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_sh(__mmask32 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)(U)))

1479#define _mm512_cvt_roundpd_ph(A, R) \
1480 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1481 (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
1482
1483#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
1484 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
1485 (__mmask8)(U), (int)(R)))
1486
1487#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
1488 ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
1489 (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1490
1491static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
1492 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1493 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
1495}
1496
1497static __inline__ __m128h __DEFAULT_FN_ATTRS512
1498_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
1499 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1500 (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1501}
1502
1503static __inline__ __m128h __DEFAULT_FN_ATTRS512
1504_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
1505 return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
1506 (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1508}
1509
1510#define _mm512_cvt_roundph_pd(A, R) \
1511 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1512 (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
1513
1514#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
1515 ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
1516 (__mmask8)(U), (int)(R)))
1517
1518#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
1519 ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
1520 (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
1521
1522static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1523 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1524 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1525 _MM_FROUND_CUR_DIRECTION);
1526}
1527
1528static __inline__ __m512d __DEFAULT_FN_ATTRS512
1529_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1530 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1531 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1532}
1533
1534static __inline__ __m512d __DEFAULT_FN_ATTRS512
1535_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1536 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1537 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1538 _MM_FROUND_CUR_DIRECTION);
1539}
1540
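/* Editorial note: illustrative round trip through the double <-> half
 * conversions above (a sketch, not part of the upstream header). Eight
 * doubles narrow into the low 128 bits as eight _Float16 values.
 *
 *   __m512d d = _mm512_set1_pd(0.5);
 *   __m128h h = _mm512_cvtpd_ph(d);    // 8 x double -> 8 x _Float16
 *   __m512d back = _mm512_cvtph_pd(h); // exact here: 0.5 is representable
 */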
1541#define _mm_cvt_roundsh_ss(A, B, R) \
1542 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1543 (__v4sf)_mm_undefined_ps(), \
1544 (__mmask8)(-1), (int)(R)))
1545
1546#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1547 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1548 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1549
1550#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1551 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1552 (__v4sf)_mm_setzero_ps(), \
1553 (__mmask8)(U), (int)(R)))
1554
1555static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1556 __m128h __B) {
1557 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1558 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1559 _MM_FROUND_CUR_DIRECTION);
1560}
1561
1562static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1563 __mmask8 __U,
1564 __m128 __A,
1565 __m128h __B) {
1566 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1567 (__v4sf)__W, (__mmask8)__U,
1568 _MM_FROUND_CUR_DIRECTION);
1569}
1570
1571static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1572 __m128 __A,
1573 __m128h __B) {
1574 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1575 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1576 _MM_FROUND_CUR_DIRECTION);
1577}
1578
1579#define _mm_cvt_roundss_sh(A, B, R) \
1580 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1581 (__v8hf)_mm_undefined_ph(), \
1582 (__mmask8)(-1), (int)(R)))
1583
1584#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1585 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1586 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1587
1588#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1589 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1590 (__v8hf)_mm_setzero_ph(), \
1591 (__mmask8)(U), (int)(R)))
1592
1593static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1594 __m128 __B) {
1595 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1596 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1597 _MM_FROUND_CUR_DIRECTION);
1598}
1599
1600static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1601 __mmask8 __U,
1602 __m128h __A,
1603 __m128 __B) {
1604 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1605 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1606 _MM_FROUND_CUR_DIRECTION);
1607}
1608
1609static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1610 __m128h __A,
1611 __m128 __B) {
1612 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1613 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1614 _MM_FROUND_CUR_DIRECTION);
1615}
1616
1617#define _mm_cvt_roundsd_sh(A, B, R) \
1618 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1619 (__v8hf)_mm_undefined_ph(), \
1620 (__mmask8)(-1), (int)(R)))
1621
1622#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1623 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1624 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1625
1626#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1627 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1628 (__v8hf)_mm_setzero_ph(), \
1629 (__mmask8)(U), (int)(R)))
1630
1631static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1632 __m128d __B) {
1633 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1634 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1635 _MM_FROUND_CUR_DIRECTION);
1636}
1637
1638static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1639 __mmask8 __U,
1640 __m128h __A,
1641 __m128d __B) {
1642 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1643 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1644 _MM_FROUND_CUR_DIRECTION);
1645}
1646
1647static __inline__ __m128h __DEFAULT_FN_ATTRS128
1648_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1649 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1650 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1651 _MM_FROUND_CUR_DIRECTION);
1652}
1653
1654#define _mm_cvt_roundsh_sd(A, B, R) \
1655 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1656 (__v2df)_mm_undefined_pd(), \
1657 (__mmask8)(-1), (int)(R)))
1658
1659#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1660 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1661 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1662
1663#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1664 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1665 (__v2df)_mm_setzero_pd(), \
1666 (__mmask8)(U), (int)(R)))
1667
1668static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1669 __m128h __B) {
1670 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1671 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1672 _MM_FROUND_CUR_DIRECTION);
1673}
1674
1675static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1676 __mmask8 __U,
1677 __m128d __A,
1678 __m128h __B) {
1679 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1680 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1681 _MM_FROUND_CUR_DIRECTION);
1682}
1683
1684static __inline__ __m128d __DEFAULT_FN_ATTRS128
1685_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1686 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1687 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1688 _MM_FROUND_CUR_DIRECTION);
1689}
1690
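/* Editorial note: illustrative use of the scalar conversions above (a
 * sketch, not part of the upstream header). Only element 0 is converted;
 * the remaining elements are passed through from the first operand.
 *
 *   __m128d a = _mm_set1_pd(1.0);
 *   __m128h b = _mm_set1_ph((_Float16)0.25);
 *   __m128d r = _mm_cvtsh_sd(a, b); // r[0] == 0.25, r[1] == 1.0
 */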
1691#define _mm512_cvt_roundph_epi16(A, R) \
1692 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1693 (__v32hi)_mm512_undefined_epi32(), \
1694 (__mmask32)(-1), (int)(R)))
1695
1696#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1697 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1698 (__mmask32)(U), (int)(R)))
1699
1700#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1701 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1702 (__v32hi)_mm512_setzero_epi32(), \
1703 (__mmask32)(U), (int)(R)))
1704
1705static __inline__ __m512i __DEFAULT_FN_ATTRS512
1706_mm512_cvtph_epi16(__m512h __A) {
1707 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1708 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1709 _MM_FROUND_CUR_DIRECTION);
1710}
1711
1712static __inline__ __m512i __DEFAULT_FN_ATTRS512
1713_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1714 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1715 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1716}
1717
1718static __inline__ __m512i __DEFAULT_FN_ATTRS512
1719_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1720 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1721 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1722 _MM_FROUND_CUR_DIRECTION);
1723}
1724
1725#define _mm512_cvtt_roundph_epi16(A, R) \
1726 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1727 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1728 (int)(R)))
1729
1730#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1731 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1732 (__mmask32)(U), (int)(R)))
1733
1734#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1735 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1736 (__v32hi)_mm512_setzero_epi32(), \
1737 (__mmask32)(U), (int)(R)))
1738
1739static __inline__ __m512i __DEFAULT_FN_ATTRS512
1740_mm512_cvttph_epi16(__m512h __A) {
1741 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1742 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1743 _MM_FROUND_CUR_DIRECTION);
1744}
1745
1746static __inline__ __m512i __DEFAULT_FN_ATTRS512
1747_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1748 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1749 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1750}
1751
1752static __inline__ __m512i __DEFAULT_FN_ATTRS512
1753_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1754 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1755 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1756 _MM_FROUND_CUR_DIRECTION);
1757}
1758
1759#define _mm512_cvt_roundepi16_ph(A, R) \
1760 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1761 (__v32hf)_mm512_undefined_ph(), \
1762 (__mmask32)(-1), (int)(R)))
1763
1764#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1765 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1766 (__mmask32)(U), (int)(R)))
1767
1768#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1769 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1770 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1771
1772static __inline__ __m512h __DEFAULT_FN_ATTRS512
1773_mm512_cvtepi16_ph(__m512i __A) {
1774 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1775 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1776 _MM_FROUND_CUR_DIRECTION);
1777}
1778
1779static __inline__ __m512h __DEFAULT_FN_ATTRS512
1780_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1781 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1782 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1783}
1784
1785static __inline__ __m512h __DEFAULT_FN_ATTRS512
1786_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1787 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1788 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1789 _MM_FROUND_CUR_DIRECTION);
1790}
1791
1792#define _mm512_cvt_roundph_epu16(A, R) \
1793 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1794 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1795 (int)(R)))
1796
1797#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1798 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1799 (__mmask32)(U), (int)(R)))
1800
1801#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1802 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1803 (__v32hu)_mm512_setzero_epi32(), \
1804 (__mmask32)(U), (int)(R)))
1805
1806static __inline__ __m512i __DEFAULT_FN_ATTRS512
1807_mm512_cvtph_epu16(__m512h __A) {
1808 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1809 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1810 _MM_FROUND_CUR_DIRECTION);
1811}
1812
1813static __inline__ __m512i __DEFAULT_FN_ATTRS512
1814_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1815 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1816 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1817}
1818
1819static __inline__ __m512i __DEFAULT_FN_ATTRS512
1820_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1821 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1822 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1823 _MM_FROUND_CUR_DIRECTION);
1824}
1825
1826#define _mm512_cvtt_roundph_epu16(A, R) \
1827 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1828 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1829 (int)(R)))
1830
1831#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1832 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1833 (__mmask32)(U), (int)(R)))
1834
1835#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1836 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1837 (__v32hu)_mm512_setzero_epi32(), \
1838 (__mmask32)(U), (int)(R)))
1839
1840static __inline__ __m512i __DEFAULT_FN_ATTRS512
1841_mm512_cvttph_epu16(__m512h __A) {
1842 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1843 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1844 _MM_FROUND_CUR_DIRECTION);
1845}
1846
1847static __inline__ __m512i __DEFAULT_FN_ATTRS512
1848_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1849 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1850 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1851}
1852
1853static __inline__ __m512i __DEFAULT_FN_ATTRS512
1854_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1855 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1856 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1857 _MM_FROUND_CUR_DIRECTION);
1858}
1859
1860#define _mm512_cvt_roundepu16_ph(A, R) \
1861 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1862 (__v32hf)_mm512_undefined_ph(), \
1863 (__mmask32)(-1), (int)(R)))
1864
1865#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1866 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1867 (__mmask32)(U), (int)(R)))
1868
1869#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1870 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1871 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1872
1873static __inline__ __m512h __DEFAULT_FN_ATTRS512
1874_mm512_cvtepu16_ph(__m512i __A) {
1875 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1876 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1877 _MM_FROUND_CUR_DIRECTION);
1878}
1879
1880static __inline__ __m512h __DEFAULT_FN_ATTRS512
1881_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1882 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1883 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1884}
1885
1886static __inline__ __m512h __DEFAULT_FN_ATTRS512
1887_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1888 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1889 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1890 _MM_FROUND_CUR_DIRECTION);
1891}
1892
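/* Editorial note: illustrative use of the half <-> 16-bit integer
 * conversions above (a sketch, not part of the upstream header). All 32
 * lanes convert in one instruction; the _round_ forms take an explicit
 * rounding mode instead of MXCSR's current direction.
 *
 *   __m512h v = _mm512_set1_ph((_Float16)1.5);
 *   __m512i i = _mm512_cvt_roundph_epi16(
 *       v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // 2, ties-to-even
 *   __m512h back = _mm512_cvtepi16_ph(i);                  // 2.0 per lane
 */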
1893#define _mm512_cvt_roundph_epi32(A, R) \
1894 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1895 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1896 (int)(R)))
1897
1898#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1899 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1900 (__mmask16)(U), (int)(R)))
1901
1902#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1903 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1904 (__v16si)_mm512_setzero_epi32(), \
1905 (__mmask16)(U), (int)(R)))
1906
1907static __inline__ __m512i __DEFAULT_FN_ATTRS512
1908_mm512_cvtph_epi32(__m256h __A) {
1909 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1910 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1911 _MM_FROUND_CUR_DIRECTION);
1912}
1913
1914static __inline__ __m512i __DEFAULT_FN_ATTRS512
1915_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1916 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1917 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1918}
1919
1920static __inline__ __m512i __DEFAULT_FN_ATTRS512
1921_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1922 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1923 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1924 _MM_FROUND_CUR_DIRECTION);
1925}
1926
1927#define _mm512_cvt_roundph_epu32(A, R) \
1928 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1929 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1930 (int)(R)))
1931
1932#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1933 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1934 (__mmask16)(U), (int)(R)))
1935
1936#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1937 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1938 (__v16su)_mm512_setzero_epi32(), \
1939 (__mmask16)(U), (int)(R)))
1940
1941static __inline__ __m512i __DEFAULT_FN_ATTRS512
1942_mm512_cvtph_epu32(__m256h __A) {
1943 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1944 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1945 _MM_FROUND_CUR_DIRECTION);
1946}
1947
1948static __inline__ __m512i __DEFAULT_FN_ATTRS512
1949_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1950 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1951 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1952}
1953
1954static __inline__ __m512i __DEFAULT_FN_ATTRS512
1955_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1956 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1957 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1958 _MM_FROUND_CUR_DIRECTION);
1959}
1960
1961#define _mm512_cvt_roundepi32_ph(A, R) \
1962 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1963 (__v16hf)_mm256_undefined_ph(), \
1964 (__mmask16)(-1), (int)(R)))
1965
1966#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1967 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1968 (__mmask16)(U), (int)(R)))
1969
1970#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1971 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1972 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1973
1974static __inline__ __m256h __DEFAULT_FN_ATTRS512
1975_mm512_cvtepi32_ph(__m512i __A) {
1976 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1977 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1978 _MM_FROUND_CUR_DIRECTION);
1979}
1980
1981static __inline__ __m256h __DEFAULT_FN_ATTRS512
1982_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1983 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1984 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1985}
1986
1987static __inline__ __m256h __DEFAULT_FN_ATTRS512
1988_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1989 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1990 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1991 _MM_FROUND_CUR_DIRECTION);
1992}
1993
1994#define _mm512_cvt_roundepu32_ph(A, R) \
1995 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1996 (__v16hf)_mm256_undefined_ph(), \
1997 (__mmask16)(-1), (int)(R)))
1998
1999#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
2000 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
2001 (__mmask16)(U), (int)(R)))
2002
2003#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
2004 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
2005 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2006
2007static __inline__ __m256h __DEFAULT_FN_ATTRS512
2008_mm512_cvtepu32_ph(__m512i __A) {
2009 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2010 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2011 _MM_FROUND_CUR_DIRECTION);
2012}
2013
2014static __inline__ __m256h __DEFAULT_FN_ATTRS512
2015_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
2016 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2017 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2018}
2019
2020static __inline__ __m256h __DEFAULT_FN_ATTRS512
2021_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2022 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2023 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2024 _MM_FROUND_CUR_DIRECTION);
2025}
2026
2027#define _mm512_cvtt_roundph_epi32(A, R) \
2028 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2029 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2030 (int)(R)))
2031
2032#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2033 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2034 (__mmask16)(U), (int)(R)))
2035
2036#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2037 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2038 (__v16si)_mm512_setzero_epi32(), \
2039 (__mmask16)(U), (int)(R)))
2040
2041static __inline__ __m512i __DEFAULT_FN_ATTRS512
2042_mm512_cvttph_epi32(__m256h __A) {
2043 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2044 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2045 _MM_FROUND_CUR_DIRECTION);
2046}
2047
2048static __inline__ __m512i __DEFAULT_FN_ATTRS512
2049_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2050 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2051 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2052}
2053
2054static __inline__ __m512i __DEFAULT_FN_ATTRS512
2055_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2056 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2057 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2058 _MM_FROUND_CUR_DIRECTION);
2059}
2060
2061#define _mm512_cvtt_roundph_epu32(A, R) \
2062 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2063 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2064 (int)(R)))
2065
2066#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2067 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2068 (__mmask16)(U), (int)(R)))
2069
2070#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2071 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2072 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2073 (int)(R)))
2074
2075static __inline__ __m512i __DEFAULT_FN_ATTRS512
2076_mm512_cvttph_epu32(__m256h __A) {
2077 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2078 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2079 _MM_FROUND_CUR_DIRECTION);
2080}
2081
2082static __inline__ __m512i __DEFAULT_FN_ATTRS512
2083_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2084 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2085 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2086}
2087
2088static __inline__ __m512i __DEFAULT_FN_ATTRS512
2089_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2090 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2091 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2092 _MM_FROUND_CUR_DIRECTION);
2093}
2094
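/* Editorial note: illustrative contrast between the cvt and cvtt forms above
 * (a sketch, not part of the upstream header; assumes _mm256_set1_ph from
 * this header family). cvtt* always truncates toward zero.
 *
 *   __m256h v = _mm256_set1_ph((_Float16)-1.75);
 *   __m512i t = _mm512_cvttph_epi32(v); // -1 in every lane (truncated)
 *   __m512i r = _mm512_cvtph_epi32(v);  // -2 under round-to-nearest-even
 */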
2095#define _mm512_cvt_roundepi64_ph(A, R) \
2096 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2097 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2098
2099#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2100 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2101 (__mmask8)(U), (int)(R)))
2102
2103#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2104 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2105 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2106
2107static __inline__ __m128h __DEFAULT_FN_ATTRS512
2108_mm512_cvtepi64_ph(__m512i __A) {
2109 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2110 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2111 _MM_FROUND_CUR_DIRECTION);
2112}
2113
2114static __inline__ __m128h __DEFAULT_FN_ATTRS512
2115_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2116 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2117 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2118}
2119
2120static __inline__ __m128h __DEFAULT_FN_ATTRS512
2121_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2122 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2123 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2124 _MM_FROUND_CUR_DIRECTION);
2125}
2126
2127#define _mm512_cvt_roundph_epi64(A, R) \
2128 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2129 (__v8di)_mm512_undefined_epi32(), \
2130 (__mmask8)(-1), (int)(R)))
2131
2132#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2133 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2134 (__mmask8)(U), (int)(R)))
2135
2136#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2137 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2138 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2139
2140static __inline__ __m512i __DEFAULT_FN_ATTRS512
2141_mm512_cvtph_epi64(__m128h __A) {
2142 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2143 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2144 _MM_FROUND_CUR_DIRECTION);
2145}
2146
2147static __inline__ __m512i __DEFAULT_FN_ATTRS512
2148_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2149 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2150 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2151}
2152
2153static __inline__ __m512i __DEFAULT_FN_ATTRS512
2154_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2155 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2156 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2157 _MM_FROUND_CUR_DIRECTION);
2158}
2159
2160#define _mm512_cvt_roundepu64_ph(A, R) \
2161 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2162 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2163
2164#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2165 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2166 (__mmask8)(U), (int)(R)))
2167
2168#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2169 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2170 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2171
2172static __inline__ __m128h __DEFAULT_FN_ATTRS512
2173_mm512_cvtepu64_ph(__m512i __A) {
2174 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2175 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2176 _MM_FROUND_CUR_DIRECTION);
2177}
2178
2179static __inline__ __m128h __DEFAULT_FN_ATTRS512
2180_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2181 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2182 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2183}
2184
2185static __inline__ __m128h __DEFAULT_FN_ATTRS512
2186_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2187 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2188 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2189 _MM_FROUND_CUR_DIRECTION);
2190}
2191
2192#define _mm512_cvt_roundph_epu64(A, R) \
2193 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2194 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2195 (int)(R)))
2196
2197#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2198 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2199 (__mmask8)(U), (int)(R)))
2200
2201#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2202 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2203 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2204
2205static __inline__ __m512i __DEFAULT_FN_ATTRS512
2206_mm512_cvtph_epu64(__m128h __A) {
2207 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2208 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2209 _MM_FROUND_CUR_DIRECTION);
2210}
2211
2212static __inline__ __m512i __DEFAULT_FN_ATTRS512
2213_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2214 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2215 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2216}
2217
2218static __inline__ __m512i __DEFAULT_FN_ATTRS512
2219_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2220 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2221 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2222 _MM_FROUND_CUR_DIRECTION);
2223}
2224
2225#define _mm512_cvtt_roundph_epi64(A, R) \
2226 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2227 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2228 (int)(R)))
2229
2230#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2231 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2232 (__mmask8)(U), (int)(R)))
2233
2234#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2235 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2236 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2237
2238static __inline__ __m512i __DEFAULT_FN_ATTRS512
2239_mm512_cvttph_epi64(__m128h __A) {
2240 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2241 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2242 _MM_FROUND_CUR_DIRECTION);
2243}
2244
2245static __inline__ __m512i __DEFAULT_FN_ATTRS512
2246_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2247 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2248 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2249}
2250
2251static __inline__ __m512i __DEFAULT_FN_ATTRS512
2252_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2253 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2254 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2255 _MM_FROUND_CUR_DIRECTION);
2256}
2257
2258#define _mm512_cvtt_roundph_epu64(A, R) \
2259 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2260 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2261 (int)(R)))
2262
2263#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2264 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2265 (__mmask8)(U), (int)(R)))
2266
2267#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2268 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2269 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2270
2271static __inline__ __m512i __DEFAULT_FN_ATTRS512
2272_mm512_cvttph_epu64(__m128h __A) {
2273 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2274 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2275 _MM_FROUND_CUR_DIRECTION);
2276}
2277
2278static __inline__ __m512i __DEFAULT_FN_ATTRS512
2279_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2280 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2281 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2282}
2283
2284static __inline__ __m512i __DEFAULT_FN_ATTRS512
2285_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2286 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2287 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2288 _MM_FROUND_CUR_DIRECTION);
2289}
2290
2291#define _mm_cvt_roundsh_i32(A, R) \
2292 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2293
2294static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2295 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2296}
2297
2298#define _mm_cvt_roundsh_u32(A, R) \
2299 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2300
2301static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2302_mm_cvtsh_u32(__m128h __A) {
2303 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2304 _MM_FROUND_CUR_DIRECTION);
2305}
2306
2307#ifdef __x86_64__
2308#define _mm_cvt_roundsh_i64(A, R) \
2309 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2310
2311static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2312 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2313 _MM_FROUND_CUR_DIRECTION);
2314}
2315
2316#define _mm_cvt_roundsh_u64(A, R) \
2317 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2318
2319static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2320_mm_cvtsh_u64(__m128h __A) {
2321 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2322 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2323}
2324#endif // __x86_64__
2325
2326#define _mm_cvt_roundu32_sh(A, B, R) \
2327 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2328
2329static __inline__ __m128h __DEFAULT_FN_ATTRS128
2330_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2331 __A[0] = __B;
2332 return __A;
2333}
2334
2335#ifdef __x86_64__
2336#define _mm_cvt_roundu64_sh(A, B, R) \
2337 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2338 (int)(R)))
2339
2340static __inline__ __m128h __DEFAULT_FN_ATTRS128
2341_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2342 __A[0] = __B;
2343 return __A;
2344}
2345#endif
2346
2347#define _mm_cvt_roundi32_sh(A, B, R) \
2348 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2349
2350static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2351 int __B) {
2352 __A[0] = __B;
2353 return __A;
2354}
2355
2356#ifdef __x86_64__
2357#define _mm_cvt_roundi64_sh(A, B, R) \
2358 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2359
2360static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2361 long long __B) {
2362 __A[0] = __B;
2363 return __A;
2364}
2365#endif
2366
2367#define _mm_cvtt_roundsh_i32(A, R) \
2368 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2369
2370static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2371 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2372 _MM_FROUND_CUR_DIRECTION);
2373}
2374
2375#ifdef __x86_64__
2376#define _mm_cvtt_roundsh_i64(A, R) \
2377 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2378
2379static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2380 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2381 _MM_FROUND_CUR_DIRECTION);
2382}
2383#endif
2384
2385#define _mm_cvtt_roundsh_u32(A, R) \
2386 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2387
2388static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2389_mm_cvttsh_u32(__m128h __A) {
2390 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2391 _MM_FROUND_CUR_DIRECTION);
2392}
2393
2394#ifdef __x86_64__
2395#define _mm_cvtt_roundsh_u64(A, R) \
2396 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2397
2398static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2399_mm_cvttsh_u64(__m128h __A) {
2400 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2401 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2402}
2403#endif
2404
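/* Editorial note: illustrative use of the scalar element <-> general-purpose
 * register conversions above (a sketch, not part of the upstream header).
 *
 *   __m128h h = _mm_set1_ph((_Float16)2.9);
 *   int a = _mm_cvtsh_i32(h);        // 3 under round-to-nearest-even
 *   int b = _mm_cvttsh_i32(h);       // 2, truncated toward zero
 *   __m128h c = _mm_cvti32_sh(h, 7); // c[0] == 7.0, c[1..7] from h
 */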
2405#define _mm512_cvtx_roundph_ps(A, R) \
2406 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2407 (__v16sf)_mm512_undefined_ps(), \
2408 (__mmask16)(-1), (int)(R)))
2409
2410#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2411 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2412 (__mmask16)(U), (int)(R)))
2413
2414#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2415 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2416 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2417
2418static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2419 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2420 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2421 _MM_FROUND_CUR_DIRECTION);
2422}
2423
2424static __inline__ __m512 __DEFAULT_FN_ATTRS512
2425_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2426 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2427 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2428}
2429
2430static __inline__ __m512 __DEFAULT_FN_ATTRS512
2431_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2432 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2433 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2434 _MM_FROUND_CUR_DIRECTION);
2435}
2436
2437#define _mm512_cvtx_roundps_ph(A, R) \
2438 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2439 (__v16hf)_mm256_undefined_ph(), \
2440 (__mmask16)(-1), (int)(R)))
2441
2442#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2443 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2444 (__mmask16)(U), (int)(R)))
2445
2446#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2447 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2448 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2449
2450static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2451 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2452 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2453 _MM_FROUND_CUR_DIRECTION);
2454}
2455
2456static __inline__ __m256h __DEFAULT_FN_ATTRS512
2457_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2458 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2459 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2460}
2461
2462static __inline__ __m256h __DEFAULT_FN_ATTRS512
2463_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2464 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2465 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2466 _MM_FROUND_CUR_DIRECTION);
2467}
2468
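/* Editorial note: illustrative use of the float <-> half conversions above
 * (a sketch, not part of the upstream header). Sixteen lanes convert
 * between one 512-bit float vector and one 256-bit half vector.
 *
 *   __m512 f = _mm512_set1_ps(3.14159f);
 *   __m256h h = _mm512_cvtxps_ph(f);                // narrows each lane
 *   __m512 g = _mm512_mask_cvtxph_ps(f, 0x00FF, h); // widen low 8 lanes only
 */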
2469#define _mm512_fmadd_round_ph(A, B, C, R) \
2470 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2471 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2472 (__mmask32)-1, (int)(R)))
2473
2474#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2475 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2476 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2477 (__mmask32)(U), (int)(R)))
2478
2479#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2480 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2481 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2482 (__mmask32)(U), (int)(R)))
2483
2484#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2485 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2486 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2487 (__mmask32)(U), (int)(R)))
2488
2489#define _mm512_fmsub_round_ph(A, B, C, R) \
2490 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2491 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2492 (__mmask32)-1, (int)(R)))
2493
2494#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2495 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2496 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2497 (__mmask32)(U), (int)(R)))
2498
2499#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2500 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2501 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2502 (__mmask32)(U), (int)(R)))
2503
2504#define _mm512_fnmadd_round_ph(A, B, C, R) \
2505 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2506 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2507 (__mmask32)-1, (int)(R)))
2508
2509#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2510 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2511 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2512 (__mmask32)(U), (int)(R)))
2513
2514#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2515 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2516 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2517 (__mmask32)(U), (int)(R)))
2518
2519#define _mm512_fnmsub_round_ph(A, B, C, R) \
2520 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2521 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2522 (__mmask32)-1, (int)(R)))
2523
2524#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2525 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2526 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2527 (__mmask32)(U), (int)(R)))
2528
2529static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2530 __m512h __B,
2531 __m512h __C) {
2532 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2533 (__v32hf)__C, (__mmask32)-1,
2534 _MM_FROUND_CUR_DIRECTION);
2535}
2536
2537static __inline__ __m512h __DEFAULT_FN_ATTRS512
2538_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2539 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2540 (__v32hf)__C, (__mmask32)__U,
2541 _MM_FROUND_CUR_DIRECTION);
2542}
2543
2544static __inline__ __m512h __DEFAULT_FN_ATTRS512
2545_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2546 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2547 (__v32hf)__C, (__mmask32)__U,
2548 _MM_FROUND_CUR_DIRECTION);
2549}
2550
2551static __inline__ __m512h __DEFAULT_FN_ATTRS512
2552_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2553 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2554 (__v32hf)__C, (__mmask32)__U,
2555 _MM_FROUND_CUR_DIRECTION);
2556}
2557
2558static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2559 __m512h __B,
2560 __m512h __C) {
2561 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2562 -(__v32hf)__C, (__mmask32)-1,
2563 _MM_FROUND_CUR_DIRECTION);
2564}
2565
2566static __inline__ __m512h __DEFAULT_FN_ATTRS512
2567_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2568 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2569 -(__v32hf)__C, (__mmask32)__U,
2570 _MM_FROUND_CUR_DIRECTION);
2571}
2572
2573static __inline__ __m512h __DEFAULT_FN_ATTRS512
2574_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2575 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2576 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2577 _MM_FROUND_CUR_DIRECTION);
2578}
2579
2580static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2581 __m512h __B,
2582 __m512h __C) {
2583 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2584 (__v32hf)__C, (__mmask32)-1,
2585 _MM_FROUND_CUR_DIRECTION);
2586}
2587
2588static __inline__ __m512h __DEFAULT_FN_ATTRS512
2589_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2590 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2591 (__v32hf)__C, (__mmask32)__U,
2592 _MM_FROUND_CUR_DIRECTION);
2593}
2594
2595static __inline__ __m512h __DEFAULT_FN_ATTRS512
2596_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2597 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2598 (__v32hf)__C, (__mmask32)__U,
2599 _MM_FROUND_CUR_DIRECTION);
2600}
2601
2602static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2603 __m512h __B,
2604 __m512h __C) {
2605 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2606 -(__v32hf)__C, (__mmask32)-1,
2607 _MM_FROUND_CUR_DIRECTION);
2608}
2609
2610static __inline__ __m512h __DEFAULT_FN_ATTRS512
2611_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2612 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2613 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2614 _MM_FROUND_CUR_DIRECTION);
2615}
2616
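/* Editorial note: illustrative use of the packed FMA family above (a sketch,
 * not part of the upstream header). fmadd computes A*B + C per lane; the
 * fms/fnm variants flip the sign of the product and/or C.
 *
 *   __m512h a = _mm512_set1_ph((_Float16)2.0);
 *   __m512h b = _mm512_set1_ph((_Float16)3.0);
 *   __m512h c = _mm512_set1_ph((_Float16)1.0);
 *   __m512h r = _mm512_fmadd_ph(a, b, c);  //  2*3 + 1 ==  7.0 per lane
 *   __m512h s = _mm512_fnmsub_ph(a, b, c); // -2*3 - 1 == -7.0 per lane
 */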
2617#define _mm512_fmaddsub_round_ph(A, B, C, R) \
2618 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2619 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2620 (__mmask32)-1, (int)(R)))
2621
2622#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2623 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2624 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2625 (__mmask32)(U), (int)(R)))
2626
2627#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2628 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2629 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2630 (__mmask32)(U), (int)(R)))
2631
2632#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2633 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2634 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2635 (__mmask32)(U), (int)(R)))
2636
2637#define _mm512_fmsubadd_round_ph(A, B, C, R) \
2638 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2639 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2640 (__mmask32)-1, (int)(R)))
2641
2642#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2643 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2644 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2645 (__mmask32)(U), (int)(R)))
2646
2647#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2648 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2649 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2650 (__mmask32)(U), (int)(R)))
2651
2652static __inline__ __m512h __DEFAULT_FN_ATTRS512
2653_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2654 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2655 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2656 _MM_FROUND_CUR_DIRECTION);
2657}
2658
2659static __inline__ __m512h __DEFAULT_FN_ATTRS512
2660_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2661 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2662 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2663 _MM_FROUND_CUR_DIRECTION);
2664}
2665
2666static __inline__ __m512h __DEFAULT_FN_ATTRS512
2667_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2668 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2669 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2670 _MM_FROUND_CUR_DIRECTION);
2671}
2672
2673static __inline__ __m512h __DEFAULT_FN_ATTRS512
2674_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2675 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2676 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2677 _MM_FROUND_CUR_DIRECTION);
2678}
2679
2680static __inline__ __m512h __DEFAULT_FN_ATTRS512
2681_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2682 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2683 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2684 _MM_FROUND_CUR_DIRECTION);
2685}
2686
2687static __inline__ __m512h __DEFAULT_FN_ATTRS512
2688_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2689 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2690 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2691 _MM_FROUND_CUR_DIRECTION);
2692}
2693
2694static __inline__ __m512h __DEFAULT_FN_ATTRS512
2695_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2696 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2697 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2698 _MM_FROUND_CUR_DIRECTION);
2699}
2700
2701#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2702 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2703 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2704 (__mmask32)(U), (int)(R)))
2705
2706static __inline__ __m512h __DEFAULT_FN_ATTRS512
2707_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2708 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2709 (__v32hf)__C, (__mmask32)__U,
2710 _MM_FROUND_CUR_DIRECTION);
2711}
2712
2713#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2714 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2715 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2716 (__mmask32)(U), (int)(R)))
2717
2718static __inline__ __m512h __DEFAULT_FN_ATTRS512
2719_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2720 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2721 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2722 _MM_FROUND_CUR_DIRECTION);
2723}
2724
2725#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2726 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2727 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2728 (__mmask32)(U), (int)(R)))
2729
2730static __inline__ __m512h __DEFAULT_FN_ATTRS512
2731_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2732 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2733 (__v32hf)__C, (__mmask32)__U,
2734 _MM_FROUND_CUR_DIRECTION);
2735}
2736
2737#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2738 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2739 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2740 (__mmask32)(U), (int)(R)))
2741
2742#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2743 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2744 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2745 (__mmask32)(U), (int)(R)))
2746
2747static __inline__ __m512h __DEFAULT_FN_ATTRS512
2748_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2749 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2750 -(__v32hf)__C, (__mmask32)__U,
2751 _MM_FROUND_CUR_DIRECTION);
2752}
2753
2754static __inline__ __m512h __DEFAULT_FN_ATTRS512
2755_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2756 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2757 (__v32hf)__C, (__mmask32)__U,
2758 _MM_FROUND_CUR_DIRECTION);
2759}
2760
2761static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2762 __m128h __A,
2763 __m128h __B) {
2764 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2765 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2766}
2767
2768static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2769 __mmask8 __U,
2770 __m128h __A,
2771 __m128h __B) {
2772 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2773 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2774}
2775
2776#define _mm_fmadd_round_sh(A, B, C, R) \
2777 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2778 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2779 (__mmask8)-1, (int)(R)))
2780
2781#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2782 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2783 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2784 (__mmask8)(U), (int)(R)))
2785
2786static __inline__ __m128h __DEFAULT_FN_ATTRS128
2787_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2788 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2789 (__mmask8)__U,
2790 _MM_FROUND_CUR_DIRECTION);
2791}
2792
2793#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2794 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2795 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2796 (__mmask8)(U), (int)(R)))
2797
2798static __inline__ __m128h __DEFAULT_FN_ATTRS128
2799_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2800 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2801 (__mmask8)__U,
2802 _MM_FROUND_CUR_DIRECTION);
2803}
2804
2805#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2806 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2807 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2808 (__mmask8)(U), (int)(R)))
2809
2810static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2811 __m128h __A,
2812 __m128h __B) {
2813 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2814 -(__v8hf)__B, (__mmask8)-1,
2815 _MM_FROUND_CUR_DIRECTION);
2816}
2817
2818static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2819 __mmask8 __U,
2820 __m128h __A,
2821 __m128h __B) {
2822 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2823 -(__v8hf)__B, (__mmask8)__U,
2824 _MM_FROUND_CUR_DIRECTION);
2825}
2826
2827#define _mm_fmsub_round_sh(A, B, C, R) \
2828 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2829 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2830 (__mmask8)-1, (int)(R)))
2831
2832#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2833 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2834 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2835 (__mmask8)(U), (int)(R)))
2836
2837static __inline__ __m128h __DEFAULT_FN_ATTRS128
2838_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2839 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2840 -(__v8hf)__C, (__mmask8)__U,
2841 _MM_FROUND_CUR_DIRECTION);
2842}
2843
2844#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2845 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2846 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2847 (__mmask8)(U), (int)(R)))
2848
2849static __inline__ __m128h __DEFAULT_FN_ATTRS128
2850_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2851 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2852 (__mmask8)__U,
2853 _MM_FROUND_CUR_DIRECTION);
2854}
2855
2856#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2857 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2858 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2859 (__mmask8)(U), (int)(R)))
2860
2861static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2862 __m128h __A,
2863 __m128h __B) {
2864 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2865 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2866}
2867
2868static __inline__ __m128h __DEFAULT_FN_ATTRS128
2869_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2870 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2871 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2872}
2873
2874#define _mm_fnmadd_round_sh(A, B, C, R) \
2875 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2876 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2877 (__mmask8)-1, (int)(R)))
2878
2879#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2880 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2881 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2882 (__mmask8)(U), (int)(R)))
2883
2884static __inline__ __m128h __DEFAULT_FN_ATTRS128
2885_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2886 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2887 (__mmask8)__U,
2888 _MM_FROUND_CUR_DIRECTION);
2889}
2890
2891#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2892 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2893 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2894 (__mmask8)(U), (int)(R)))
2895
2896static __inline__ __m128h __DEFAULT_FN_ATTRS128
2897_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2898 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2899 (__mmask8)__U,
2900 _MM_FROUND_CUR_DIRECTION);
2901}
2902
2903#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2904 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2905 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2906 (__mmask8)(U), (int)(R)))
2907
2908static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2909 __m128h __A,
2910 __m128h __B) {
2911 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2912 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2913}
2914
2915static __inline__ __m128h __DEFAULT_FN_ATTRS128
2916_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2917 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2918 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2919}
2920
2921#define _mm_fnmsub_round_sh(A, B, C, R) \
2922 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2923 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2924 (__mmask8)-1, (int)(R)))
2925
2926#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2927 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2928 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2929 (__mmask8)(U), (int)(R)))
2930
2931static __inline__ __m128h __DEFAULT_FN_ATTRS128
2932_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2933 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2934 (__mmask8)__U,
2935 _MM_FROUND_CUR_DIRECTION);
2936}
2937
2938#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2939 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2940 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2941 (__mmask8)(U), (int)(R)))
2942
2943static __inline__ __m128h __DEFAULT_FN_ATTRS128
2944_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2945 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2946 (__mmask8)__U,
2947 _MM_FROUND_CUR_DIRECTION);
2948}
2949
2950#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2951 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2952 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2953 (__mmask8)(U), (int)(R)))
2954
2955static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2956 __m128h __B,
2957 __m128h __C) {
2958 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2959 (__v4sf)__C, (__mmask8)-1,
2960 _MM_FROUND_CUR_DIRECTION);
2961}
2962
2963static __inline__ __m128h __DEFAULT_FN_ATTRS128
2964_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2965 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2966 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2967}
2968
2969static __inline__ __m128h __DEFAULT_FN_ATTRS128
2970_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2971 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2972 (__v4sf)__C, (__mmask8)__U,
2973 _MM_FROUND_CUR_DIRECTION);
2974}
2975
2976static __inline__ __m128h __DEFAULT_FN_ATTRS128
2977_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2978 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2979 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2980}
2981
2982#define _mm_fcmadd_round_sch(A, B, C, R) \
2983 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2984 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2985 (__mmask8)-1, (int)(R)))
2986
2987#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2988 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2989 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2990 (__mmask8)(U), (int)(R)))
2991
2992#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2993 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2994 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2995 (__mmask8)(U), (int)(R)))
2996
2997#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2998 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2999 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3000 (__mmask8)(U), (int)(R)))
3001
3002static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
3003 __m128h __B,
3004 __m128h __C) {
3005 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
3006 (__v4sf)__C, (__mmask8)-1,
3007 _MM_FROUND_CUR_DIRECTION);
3008}
3009
3010static __inline__ __m128h __DEFAULT_FN_ATTRS128
3011_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
3012 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
3013 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
3014}
3015
3016static __inline__ __m128h __DEFAULT_FN_ATTRS128
3017_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
3018 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3019 (__v4sf)__C, (__mmask8)__U,
3020 _MM_FROUND_CUR_DIRECTION);
3021}
3022
3023static __inline__ __m128h __DEFAULT_FN_ATTRS128
3024_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3025 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3026 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3027}
3028
3029#define _mm_fmadd_round_sch(A, B, C, R) \
3030 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3031 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3032 (__mmask8)-1, (int)(R)))
3033
3034#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3035 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3036 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3037 (__mmask8)(U), (int)(R)))
3038
3039#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3040 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3041 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3042 (__mmask8)(U), (int)(R)))
3043
3044#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3045 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3046 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3047 (__mmask8)(U), (int)(R)))
3048
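/* Usage sketch (illustrative): _mm_fmadd_sch is the non-conjugating
 * counterpart, computing A*B + C on the low complex pair, e.g.
 * (1+2i)*(1-1i) + 0 = 1 - 1i + 2i + 2 = 3+1i:
 *
 *   __m128h a = (__m128h){1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 *   __m128h b = (__m128h){1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 *   __m128h r = _mm_fmadd_sch(a, b, _mm_setzero_ph());
 *   // r[0] == 3.0, r[1] == 1.0
 */
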
3049static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3050 __m128h __B) {
3051 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3052      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3053      _MM_FROUND_CUR_DIRECTION);
3054}
3055
3056static __inline__ __m128h __DEFAULT_FN_ATTRS128
3057_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3058 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3059                                                (__v4sf)__W, (__mmask8)__U,
3060                                                _MM_FROUND_CUR_DIRECTION);
3061}
3062
3063static __inline__ __m128h __DEFAULT_FN_ATTRS128
3064_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3065 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3066      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3067      _MM_FROUND_CUR_DIRECTION);
3068}
3069
3070#define _mm_fcmul_round_sch(A, B, R) \
3071 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3072 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3073 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3074
3075#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3076 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3077 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3078 (__mmask8)(U), (int)(R)))
3079
3080#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3081 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3082 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3083 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3084
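/* Usage sketch (illustrative): _mm_fcmul_sch multiplies the low complex pair
 * of A by the complex conjugate of the low complex pair of B, e.g.
 * (1+2i) * conj(3+4i) = (1+2i)*(3-4i) = 3 - 4i + 6i + 8 = 11+2i:
 *
 *   __m128h a = (__m128h){1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 *   __m128h b = (__m128h){3.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 *   __m128h r = _mm_fcmul_sch(a, b); // r[0] == 11.0, r[1] == 2.0
 */
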
3085static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3086 __m128h __B) {
3087 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3088      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3089      _MM_FROUND_CUR_DIRECTION);
3090}
3091
3092static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3093 __mmask8 __U,
3094 __m128h __A,
3095 __m128h __B) {
3096 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3097                                               (__v4sf)__W, (__mmask8)__U,
3098                                               _MM_FROUND_CUR_DIRECTION);
3099}
3100
3101static __inline__ __m128h __DEFAULT_FN_ATTRS128
3102_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3103 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3104      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3105      _MM_FROUND_CUR_DIRECTION);
3106}
3107
3108#define _mm_fmul_round_sch(A, B, R) \
3109 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3110 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3111 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3112
3113#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3114 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3115 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3116 (__mmask8)(U), (int)(R)))
3117
3118#define _mm_maskz_fmul_round_sch(U, A, B, R) \
3119 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3120 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3121 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3122
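/* Usage sketch (illustrative): _mm_fmul_sch computes the plain, unconjugated
 * complex product, e.g. (1+2i)*(3+4i) = 3 + 4i + 6i - 8 = -5+10i:
 *
 *   __m128h a = (__m128h){1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 *   __m128h b = (__m128h){3.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 *   __m128h p = _mm_fmul_sch(a, b); // p[0] == -5.0, p[1] == 10.0
 */
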
3123static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3124 __m512h __B) {
3125 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3126      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3127      _MM_FROUND_CUR_DIRECTION);
3128}
3129
3130static __inline__ __m512h __DEFAULT_FN_ATTRS512
3131_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3132 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3133                                                   (__v16sf)__W, (__mmask16)__U,
3134                                                   _MM_FROUND_CUR_DIRECTION);
3135}
3136
3137static __inline__ __m512h __DEFAULT_FN_ATTRS512
3138_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3139 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3140      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3141      _MM_FROUND_CUR_DIRECTION);
3142}
3143
3144#define _mm512_fcmul_round_pch(A, B, R) \
3145 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3146 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3147 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3148
3149#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3150 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3151 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3152 (__mmask16)(U), (int)(R)))
3153
3154#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3155 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3156 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3157 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3158
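/* Usage sketch (illustrative): the packed _pch forms operate on 16 complex
 * numbers per __m512h (32 _Float16 elements), so the __mmask16 write-mask
 * carries one bit per complex number, i.e. per real/imaginary pair:
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.0); // sixteen copies of 1+1i
 *   __m512h b = _mm512_set1_ph((_Float16)2.0); // sixteen copies of 2+2i
 *   // low eight complex slots get (1+1i)*conj(2+2i) = 4+0i, rest are zeroed
 *   __m512h r = _mm512_maskz_fcmul_pch((__mmask16)0x00FF, a, b);
 */
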
3159static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3160 __m512h __B) {
3161 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3162      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3163      _MM_FROUND_CUR_DIRECTION);
3164}
3165
3166static __inline__ __m512h __DEFAULT_FN_ATTRS512
3167_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3168 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3169                                                  (__v16sf)__W, (__mmask16)__U,
3170                                                  _MM_FROUND_CUR_DIRECTION);
3171}
3172
3173static __inline__ __m512h __DEFAULT_FN_ATTRS512
3174_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3175 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3176      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3177      _MM_FROUND_CUR_DIRECTION);
3178}
3179
3180#define _mm512_fmul_round_pch(A, B, R) \
3181 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3182 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3183 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3184
3185#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3186 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3187 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3188 (__mmask16)(U), (int)(R)))
3189
3190#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3191 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3192 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3193 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3194
3195static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3196 __m512h __B,
3197 __m512h __C) {
3198 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3199      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3200      _MM_FROUND_CUR_DIRECTION);
3201}
3202
3203static __inline__ __m512h __DEFAULT_FN_ATTRS512
3204_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3205 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3206      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3207      _MM_FROUND_CUR_DIRECTION);
3208}
3209
3210static __inline__ __m512h __DEFAULT_FN_ATTRS512
3211_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3212 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3213      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3214      _MM_FROUND_CUR_DIRECTION);
3215}
3216
3217static __inline__ __m512h __DEFAULT_FN_ATTRS512
3218_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3219 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3220      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3221      _MM_FROUND_CUR_DIRECTION);
3222}
3223
3224#define _mm512_fcmadd_round_pch(A, B, C, R) \
3225 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3226 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3227 (__mmask16)-1, (int)(R)))
3228
3229#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3230 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3231 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3232 (__mmask16)(U), (int)(R)))
3233
3234#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3235 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3236 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3237 (__mmask16)(U), (int)(R)))
3238
3239#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3240 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3241 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3242 (__mmask16)(U), (int)(R)))
3243
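/* Usage sketch (illustrative): the masked complex FMA variants differ in
 * which operand supplies masked-off elements: _mm512_mask_fcmadd_pch merges
 * from the first source, while _mm512_mask3_fcmadd_pch merges from the
 * accumulator (again with one mask bit per complex number):
 *
 *   __m512h a   = _mm512_set1_ph((_Float16)1.0);
 *   __m512h b   = _mm512_set1_ph((_Float16)2.0);
 *   __m512h acc = _mm512_setzero_ph();
 *   // even complex slots get a*conj(b) + acc; odd slots keep acc's values
 *   acc = _mm512_mask3_fcmadd_pch(a, b, acc, (__mmask16)0x5555);
 */
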
3244static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3245 __m512h __B,
3246 __m512h __C) {
3247 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3248                                                    (__v16sf)__C, (__mmask16)-1,
3249                                                    _MM_FROUND_CUR_DIRECTION);
3250}
3251
3252static __inline__ __m512h __DEFAULT_FN_ATTRS512
3253_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3254 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3255                                                   (__v16sf)__C, (__mmask16)__U,
3256                                                   _MM_FROUND_CUR_DIRECTION);
3257}
3258
3259static __inline__ __m512h __DEFAULT_FN_ATTRS512
3260_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3261 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3262      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3263      _MM_FROUND_CUR_DIRECTION);
3264}
3265
3266static __inline__ __m512h __DEFAULT_FN_ATTRS512
3267_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3268 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3269      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3270      _MM_FROUND_CUR_DIRECTION);
3271}
3272
3273#define _mm512_fmadd_round_pch(A, B, C, R) \
3274 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3275 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3276 (__mmask16)-1, (int)(R)))
3277
3278#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3279 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3280 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3281 (__mmask16)(U), (int)(R)))
3282
3283#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3284 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3285 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3286 (__mmask16)(U), (int)(R)))
3287
3288#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3289 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3290 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3291 (__mmask16)(U), (int)(R)))
3292
3293static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3294_mm512_reduce_add_ph(__m512h __W) {
3295 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3296}
3297
3298static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3299_mm512_reduce_mul_ph(__m512h __W) {
3300 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3301}
3302
3303static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3304_mm512_reduce_max_ph(__m512h __V) {
3305 return __builtin_ia32_reduce_fmax_ph512(__V);
3306}
3307
3308static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3309_mm512_reduce_min_ph(__m512h __V) {
3310 return __builtin_ia32_reduce_fmin_ph512(__V);
3311}
3312
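/* Usage sketch (illustrative): the reductions collapse all 32 _Float16
 * elements to a scalar. The add reduction seeds with -0.0 (the identity for
 * floating-point addition) and may reassociate, so the evaluation order need
 * not match a sequential loop:
 *
 *   __m512h v = _mm512_set1_ph((_Float16)0.5);
 *   _Float16 sum = _mm512_reduce_add_ph(v); // 32 * 0.5 == 16.0
 *   _Float16 max = _mm512_reduce_max_ph(v); // 0.5
 */
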
3313static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3314_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3315 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3316 (__v32hf)__A);
3317}
3318
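/* Usage sketch (illustrative): blend selects per element with no arithmetic;
 * a set bit in the __mmask32 takes the element from the second source, a
 * clear bit takes it from the first:
 *
 *   __m512h a = _mm512_set1_ph((_Float16)1.0);
 *   __m512h w = _mm512_set1_ph((_Float16)2.0);
 *   __m512h r = _mm512_mask_blend_ph((__mmask32)0xFFFF0000, a, w);
 *   // elements 0..15 come from a, elements 16..31 from w
 */
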
3319static __inline__ __m512h __DEFAULT_FN_ATTRS512
3320_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3321 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3322 (__v32hi)__B);
3323}
3324
3325static __inline__ __m512h __DEFAULT_FN_ATTRS512
3326_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3327 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3328}
3329
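/* Usage sketch (illustrative): _mm512_permutexvar_ph gathers elements of the
 * source using the low five bits of each 16-bit index, so an all-zero index
 * vector broadcasts element 0; _mm512_permutex2var_ph indexes the 64-element
 * concatenation of two sources, with bit 5 of each index selecting between
 * them:
 *
 *   __m512h v   = _mm512_set1_ph((_Float16)1.0); // any 32-element source
 *   __m512i idx = _mm512_setzero_si512();
 *   __m512h bc  = _mm512_permutexvar_ph(idx, v); // every element == v[0]
 */
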
3330// The intrinsics below are aliases for the f[c]mul_[sp]ch intrinsics above.
3331#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
3332#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
3333#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
3334#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
3335#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
3336 _mm512_mask_fmul_round_pch(W, U, A, B, R)
3337#define _mm512_maskz_mul_round_pch(U, A, B, R) \
3338 _mm512_maskz_fmul_round_pch(U, A, B, R)
3339
3340#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
3341#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
3342#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
3343#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
3344#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
3345 _mm512_mask_fcmul_round_pch(W, U, A, B, R)
3346#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
3347 _mm512_maskz_fcmul_round_pch(U, A, B, R)
3348
3349#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
3350#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
3351#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
3352#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
3353#define _mm_mask_mul_round_sch(W, U, A, B, R) \
3354 _mm_mask_fmul_round_sch(W, U, A, B, R)
3355#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
3356
3357#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
3358#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
3359#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
3360#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
3361#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
3362 _mm_mask_fcmul_round_sch(W, U, A, B, R)
3363#define _mm_maskz_cmul_round_sch(U, A, B, R) \
3364 _mm_maskz_fcmul_round_sch(U, A, B, R)
3365
3366#undef __DEFAULT_FN_ATTRS128
3367#undef __DEFAULT_FN_ATTRS256
3368#undef __DEFAULT_FN_ATTRS512
3369#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
3370#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
3371#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
3372
3373#endif
3374#endif