/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H

/* Define the 512-bit _Float16 vector types used in this file. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
                 __min_vector_width__(128)))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

static __inline__ _Float16 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

static __inline __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9, __h8, __h7, __h6, __h5,
                            __h4, __h3, __h2, __h1};
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setr_ph(
    _Float16 e0, _Float16 e1, _Float16 e2, _Float16 e3, _Float16 e4,
    _Float16 e5, _Float16 e6, _Float16 e7, _Float16 e8, _Float16 e9,
    _Float16 e10, _Float16 e11, _Float16 e12, _Float16 e13, _Float16 e14,
    _Float16 e15, _Float16 e16, _Float16 e17, _Float16 e18, _Float16 e19,
    _Float16 e20, _Float16 e21, _Float16 e22, _Float16 e23, _Float16 e24,
    _Float16 e25, _Float16 e26, _Float16 e27, _Float16 e28, _Float16 e29,
    _Float16 e30, _Float16 e31) {
  return _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21,
                       e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10,
                       e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

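/* Illustrative example (not part of the upstream header): _mm512_set_ph
 * takes its 32 arguments from the highest lane down to lane 0, while
 * _mm512_setr_ph takes them in lane order, so both calls below put the
 * value i into lane i:
 *
 *   __m512h __x = _mm512_setr_ph(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
 *                                13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
 *                                24, 25, 26, 27, 28, 29, 30, 31);
 *   __m512h __y = _mm512_set_ph(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21,
 *                               20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,
 *                               9, 8, 7, 6, 5, 4, 3, 2, 1, 0); // same as __x
 */
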
static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_set1_pch(_Float16 _Complex __h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                              15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                                 27, 28, 29, 30, 31);
}

/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextph128_ph512(__m128h __a) {
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_zextph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

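/* Illustrative example (not part of the upstream header): unlike
 * _mm512_castph128_ph512, which leaves the upper lanes undefined, the zext
 * forms above guarantee zeros in the extended lanes. _mm_set1_ph comes from
 * the companion avx512vlfp16intrin.h header:
 *
 *   __m128h __lo = _mm_set1_ph((_Float16)1.0);
 *   __m512h __z = _mm512_zextph128_ph512(__lo); // lanes 8..31 are 0.0
 *   __m512h __c = _mm512_castph128_ph512(__lo); // lanes 8..31 are undefined
 */
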
#define _mm_comi_round_sh(A, B, P, R) \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred) \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)

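/* Illustrative example (not part of the upstream header): the comi macros
 * compare the low half elements under a _CMP_* predicate and yield 0 or 1:
 *
 *   int __lt = _mm_comi_sh(__a, __b, _CMP_LT_OS); // __a, __b are __m128h
 */
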
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
                                                            __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

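/* Masking sketch (not part of the upstream header): for each lane i, the
 * mask_ form keeps __W[i] where mask bit i is clear, and the maskz_ form
 * writes 0.0 there instead:
 *
 *   __m512h __r = _mm512_mask_add_ph(__w, 0xFFFF0000, __a, __b);
 *   // lanes 0..15 are copied from __w, lanes 16..31 hold __a + __b
 */
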
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_epi32((__m512i)__A,
                                   _mm512_set1_epi32(-2147483648));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}

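/* Illustrative example (not part of the upstream header): the _pch
 * intrinsics treat __m512h as 16 complex numbers, each a (real, imag) pair
 * of _Float16 sharing one 32-bit lane; _mm512_conj_pch flips the sign bit
 * of every imaginary part:
 *
 *   _Float16 _Complex __c = __builtin_complex((_Float16)1.0, (_Float16)2.0);
 *   __m512h __z = _mm512_set1_pch(__c);  // 16 copies of 1+2i
 *   __m512h __zb = _mm512_conj_pch(__z); // 16 copies of 1-2i
 */
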
static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_add_sh(__m128h __A, __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_addsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h
    __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sub_sh(__m128h __A, __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_subsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h
    __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mul_sh(__m128h __A, __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_mulsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h
    __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_div_sh(__m128h __A, __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_divsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_minsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_maxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm512_cmp_round_ph_mask(A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
                                           (__v32hf)(__m512h)(B), (int)(P), \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P) \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm_cmp_round_sh_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
                                       (__v8hf)(__m128h)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsh_mask( \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
      _MM_FROUND_CUR_DIRECTION))
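
/* Illustrative example (not part of the upstream header): a packed compare
 * yields one mask bit per lane, which the mask_/maskz_ intrinsics consume:
 *
 *   __mmask32 __gt = _mm512_cmp_ph_mask(__a, __b, _CMP_GT_OS);
 *   __m512h __r = _mm512_maskz_add_ph(__gt, __a, __b); // a+b where a > b, else 0
 */
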
// loads with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src,
                                                __U & 1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

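/* Illustrative example (not part of the upstream header): _mm512_load_ph
 * assumes a 64-byte-aligned address, while _mm512_loadu_ph accepts any
 * address:
 *
 *   _Float16 __buf[32];
 *   __m512h __v = _mm512_loadu_ph(__buf); // no alignment requirement
 */
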
// stores with vmovsh:
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

// moves with vmovsh:
static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_move_sh(__m128h __a, __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

// vmovw:
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

#define _mm512_getmant_ph(A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_getmantph512_mask( \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

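/* Note (not part of the upstream header): the interval selector (B) and
 * sign control (C) are packed into one immediate as ((C) << 2) | (B); the
 * _MM_MANT_* enums from avx512fintrin.h supply the operand values:
 *
 *   __m512h __m = _mm512_getmant_ph(__x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 */
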
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_scalefph512_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_roundscale_ph(A, B) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm) \
  ((__m512h)__builtin_ia32_rndscaleph_mask( \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
                                           (__v32hf)(__m512h)(A), \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
                                           (__v32hf)_mm512_setzero_ph(), \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                           (__v32hf)_mm512_undefined_ph(), \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_reduce_ph(A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm) \
  ((__m512h)__builtin_ia32_reduceph512_mask( \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(), \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R) \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(), \
                                            (__mmask32)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

#define _mm_getmant_round_sh(A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
  ((__m128h)__builtin_ia32_getmantsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_scalefsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

#define _mm_roundscale_round_sh(A, B, imm, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_reduce_sh(A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_reducesh_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm512_sqrt_round_ph(A, R) \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R) \
  ((__m512h)__builtin_ia32_selectph_512( \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_elementwise_sqrt((__v32hf)__A);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U), (__v32hf)_mm512_sqrt_ph(__A), (__v32hf)(__m512h)(__W));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)(__U),
                                              (__v32hf)_mm512_sqrt_ph(__A),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm_sqrt_round_sh(A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
                                                                 __mmask32 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U & 1), _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U & 1), _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm) \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
                                           (__mmask8)(U)))

#define _mm512_cvt_roundpd_ph(A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundph_pd(A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

1503static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
1504 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1505 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
1506 _MM_FROUND_CUR_DIRECTION);
1507}
1508
1509static __inline__ __m512d __DEFAULT_FN_ATTRS512
1510_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
1511 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1512 (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
1513}
1514
1515static __inline__ __m512d __DEFAULT_FN_ATTRS512
1516_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
1517 return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
1518 (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
1519 _MM_FROUND_CUR_DIRECTION);
1520}
1521
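/* Usage sketch: _mm512_cvtpd_ph narrows 8 doubles to 8 _Float16 values and
 * _mm512_cvtph_pd widens them back, so composing the two quantizes a double
 * vector to half precision. */
static inline __m512d __fp16_quantize_pd_example(__m512d __x) {
  return _mm512_cvtph_pd(_mm512_cvtpd_ph(__x));
}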
1522#define _mm_cvt_roundsh_ss(A, B, R) \
1523 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1524 (__v4sf)_mm_undefined_ps(), \
1525 (__mmask8)(-1), (int)(R)))
1526
1527#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
1528 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
1529 (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
1530
1531#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
1532 ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
1533 (__v4sf)_mm_setzero_ps(), \
1534 (__mmask8)(U), (int)(R)))
1535
1536static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1537 __m128h __B) {
1538 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1539 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1540 _MM_FROUND_CUR_DIRECTION);
1541}
1542
1543static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
1544 __mmask8 __U,
1545 __m128 __A,
1546 __m128h __B) {
1547 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
1548 (__v4sf)__W, (__mmask8)__U,
1549 _MM_FROUND_CUR_DIRECTION);
1550}
1551
1552static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
1553 __m128 __A,
1554 __m128h __B) {
1555 return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1556 (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
1557 _MM_FROUND_CUR_DIRECTION);
1558}
1559
1560#define _mm_cvt_roundss_sh(A, B, R) \
1561 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1562 (__v8hf)_mm_undefined_ph(), \
1563 (__mmask8)(-1), (int)(R)))
1564
1565#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
1566 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
1567 (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1568
1569#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
1570 ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
1571 (__v8hf)_mm_setzero_ph(), \
1572 (__mmask8)(U), (int)(R)))
1573
1574static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
1575 __m128 __B) {
1576 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1577 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1578 _MM_FROUND_CUR_DIRECTION);
1579}
1580
1581static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
1582 __mmask8 __U,
1583 __m128h __A,
1584 __m128 __B) {
1585 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1586 (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
1587 _MM_FROUND_CUR_DIRECTION);
1588}
1589
1590static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
1591 __m128h __A,
1592 __m128 __B) {
1593 return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
1594 (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1595 _MM_FROUND_CUR_DIRECTION);
1596}
1597
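/* Usage sketch: round-tripping one float through _Float16 with the scalar
 * converts above. The first argument only supplies the upper elements of the
 * 128-bit result, so a zero vector is fine here. */
static inline float __half_roundtrip_example(float __x) {
  __m128h __h = _mm_cvtss_sh(_mm_setzero_ph(), _mm_set_ss(__x));
  return _mm_cvtss_f32(_mm_cvtsh_ss(_mm_setzero_ps(), __h));
}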
1598#define _mm_cvt_roundsd_sh(A, B, R) \
1599 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1600 (__v8hf)_mm_undefined_ph(), \
1601 (__mmask8)(-1), (int)(R)))
1602
1603#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
1604 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
1605 (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
1606
1607#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
1608 ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
1609 (__v8hf)_mm_setzero_ph(), \
1610 (__mmask8)(U), (int)(R)))
1611
1612static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
1613 __m128d __B) {
1614 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1615 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
1616 _MM_FROUND_CUR_DIRECTION);
1617}
1618
1619static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
1620 __mmask8 __U,
1621 __m128h __A,
1622 __m128d __B) {
1623 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1624 (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
1625 _MM_FROUND_CUR_DIRECTION);
1626}
1627
1628static __inline__ __m128h __DEFAULT_FN_ATTRS128
1629_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
1630 return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
1631 (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
1632 _MM_FROUND_CUR_DIRECTION);
1633}
1634
1635#define _mm_cvt_roundsh_sd(A, B, R) \
1636 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1637 (__v2df)_mm_undefined_pd(), \
1638 (__mmask8)(-1), (int)(R)))
1639
1640#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
1641 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
1642 (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
1643
1644#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
1645 ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
1646 (__v2df)_mm_setzero_pd(), \
1647 (__mmask8)(U), (int)(R)))
1648
1649static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
1650 __m128h __B) {
1651 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1652 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
1653 _MM_FROUND_CUR_DIRECTION);
1654}
1655
1656static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
1657 __mmask8 __U,
1658 __m128d __A,
1659 __m128h __B) {
1660 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1661 (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
1662 _MM_FROUND_CUR_DIRECTION);
1663}
1664
1665static __inline__ __m128d __DEFAULT_FN_ATTRS128
1666_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
1667 return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
1668 (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
1669 _MM_FROUND_CUR_DIRECTION);
1670}
1671
1672#define _mm512_cvt_roundph_epi16(A, R) \
1673 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1674 (__v32hi)_mm512_undefined_epi32(), \
1675 (__mmask32)(-1), (int)(R)))
1676
1677#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
1678 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1679 (__mmask32)(U), (int)(R)))
1680
1681#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
1682 ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
1683 (__v32hi)_mm512_setzero_epi32(), \
1684 (__mmask32)(U), (int)(R)))
1685
1686static __inline__ __m512i __DEFAULT_FN_ATTRS512
1687_mm512_cvtph_epi16(__m512h __A) {
1688 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1689 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1690 _MM_FROUND_CUR_DIRECTION);
1691}
1692
1693static __inline__ __m512i __DEFAULT_FN_ATTRS512
1694_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1695 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1696 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1697}
1698
1699static __inline__ __m512i __DEFAULT_FN_ATTRS512
1700_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
1701 return (__m512i)__builtin_ia32_vcvtph2w512_mask(
1702 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1703 _MM_FROUND_CUR_DIRECTION);
1704}
1705
1706#define _mm512_cvtt_roundph_epi16(A, R) \
1707 ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
1708 (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
1709 (int)(R)))
1710
1711#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
1712 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
1713 (__mmask32)(U), (int)(R)))
1714
1715#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
1716 ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
1717 (__v32hi)_mm512_setzero_epi32(), \
1718 (__mmask32)(U), (int)(R)))
1719
1720static __inline__ __m512i __DEFAULT_FN_ATTRS512
1721_mm512_cvttph_epi16(__m512h __A) {
1722 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1723 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
1724 _MM_FROUND_CUR_DIRECTION);
1725}
1726
1727static __inline__ __m512i __DEFAULT_FN_ATTRS512
1728_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
1729 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1730 (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1731}
1732
1733static __inline__ __m512i __DEFAULT_FN_ATTRS512
1734_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
1735 return (__m512i)__builtin_ia32_vcvttph2w512_mask(
1736 (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
1737 _MM_FROUND_CUR_DIRECTION);
1738}
1739
1740#define _mm512_cvt_roundepi16_ph(A, R) \
1741 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
1742 (__v32hf)_mm512_undefined_ph(), \
1743 (__mmask32)(-1), (int)(R)))
1744
1745#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
1746 ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
1747 (__mmask32)(U), (int)(R)))
1748
1749#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
1750 ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
1751 (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1752
1753static __inline__ __m512h __DEFAULT_FN_ATTRS512
1754_mm512_cvtepi16_ph(__m512i __A) {
1755 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1756 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1757 _MM_FROUND_CUR_DIRECTION);
1758}
1759
1760static __inline__ __m512h __DEFAULT_FN_ATTRS512
1761_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1762 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1763 (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1764}
1765
1766static __inline__ __m512h __DEFAULT_FN_ATTRS512
1767_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
1768 return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
1769 (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1770 _MM_FROUND_CUR_DIRECTION);
1771}
1772
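/* Usage sketch: the round variants take a compile-time rounding-mode
 * constant; here 32 signed words are converted to _Float16 truncating
 * toward zero with exceptions suppressed. */
static inline __m512h __cvt_epi16_rz_example(__m512i __v) {
  return _mm512_cvt_roundepi16_ph(__v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}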
1773#define _mm512_cvt_roundph_epu16(A, R) \
1774 ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
1775 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1776 (int)(R)))
1777
1778#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
1779 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1780 (__mmask32)(U), (int)(R)))
1781
1782#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
1783 ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
1784 (__v32hu)_mm512_setzero_epi32(), \
1785 (__mmask32)(U), (int)(R)))
1786
1787static __inline__ __m512i __DEFAULT_FN_ATTRS512
1788_mm512_cvtph_epu16(__m512h __A) {
1789 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1790 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1791 _MM_FROUND_CUR_DIRECTION);
1792}
1793
1794static __inline__ __m512i __DEFAULT_FN_ATTRS512
1795_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1796 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1797 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1798}
1799
1800static __inline__ __m512i __DEFAULT_FN_ATTRS512
1801_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
1802 return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
1803 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1804 _MM_FROUND_CUR_DIRECTION);
1805}
1806
1807#define _mm512_cvtt_roundph_epu16(A, R) \
1808 ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
1809 (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
1810 (int)(R)))
1811
1812#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
1813 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
1814 (__mmask32)(U), (int)(R)))
1815
1816#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
1817 ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
1818 (__v32hu)_mm512_setzero_epi32(), \
1819 (__mmask32)(U), (int)(R)))
1820
1821static __inline__ __m512i __DEFAULT_FN_ATTRS512
1822_mm512_cvttph_epu16(__m512h __A) {
1823 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1824 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
1825 _MM_FROUND_CUR_DIRECTION);
1826}
1827
1828static __inline__ __m512i __DEFAULT_FN_ATTRS512
1829_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
1830 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1831 (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1832}
1833
1834static __inline__ __m512i __DEFAULT_FN_ATTRS512
1835_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
1836 return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
1837 (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
1838 _MM_FROUND_CUR_DIRECTION);
1839}
1840
1841#define _mm512_cvt_roundepu16_ph(A, R) \
1842 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
1843 (__v32hf)_mm512_undefined_ph(), \
1844 (__mmask32)(-1), (int)(R)))
1845
1846#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
1847 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
1848 (__mmask32)(U), (int)(R)))
1849
1850#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
1851 ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
1852 (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1853
1854static __inline__ __m512h __DEFAULT_FN_ATTRS512
1855_mm512_cvtepu16_ph(__m512i __A) {
1856 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1857 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
1858 _MM_FROUND_CUR_DIRECTION);
1859}
1860
1861static __inline__ __m512h __DEFAULT_FN_ATTRS512
1862_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
1863 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1864 (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
1865}
1866
1867static __inline__ __m512h __DEFAULT_FN_ATTRS512
1868_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
1869 return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
1870 (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
1871 _MM_FROUND_CUR_DIRECTION);
1872}
1873
1874#define _mm512_cvt_roundph_epi32(A, R) \
1875 ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
1876 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
1877 (int)(R)))
1878
1879#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
1880 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
1881 (__mmask16)(U), (int)(R)))
1882
1883#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
1884 ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
1885 (__v16si)_mm512_setzero_epi32(), \
1886 (__mmask16)(U), (int)(R)))
1887
1888static __inline__ __m512i __DEFAULT_FN_ATTRS512
1889_mm512_cvtph_epi32(__m256h __A) {
1890 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1891 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
1892 _MM_FROUND_CUR_DIRECTION);
1893}
1894
1895static __inline__ __m512i __DEFAULT_FN_ATTRS512
1896_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
1897 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1898 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1899}
1900
1901static __inline__ __m512i __DEFAULT_FN_ATTRS512
1902_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
1903 return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
1904 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
1905 _MM_FROUND_CUR_DIRECTION);
1906}
1907
1908#define _mm512_cvt_roundph_epu32(A, R) \
1909 ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
1910 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
1911 (int)(R)))
1912
1913#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
1914 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
1915 (__mmask16)(U), (int)(R)))
1916
1917#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
1918 ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
1919 (__v16su)_mm512_setzero_epi32(), \
1920 (__mmask16)(U), (int)(R)))
1921
1922static __inline__ __m512i __DEFAULT_FN_ATTRS512
1923_mm512_cvtph_epu32(__m256h __A) {
1924 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1925 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
1926 _MM_FROUND_CUR_DIRECTION);
1927}
1928
1929static __inline__ __m512i __DEFAULT_FN_ATTRS512
1930_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
1931 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1932 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1933}
1934
1935static __inline__ __m512i __DEFAULT_FN_ATTRS512
1936_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
1937 return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
1938 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
1939 _MM_FROUND_CUR_DIRECTION);
1940}
1941
1942#define _mm512_cvt_roundepi32_ph(A, R) \
1943 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
1944 (__v16hf)_mm256_undefined_ph(), \
1945 (__mmask16)(-1), (int)(R)))
1946
1947#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
1948 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
1949 (__mmask16)(U), (int)(R)))
1950
1951#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
1952 ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
1953 (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1954
1955static __inline__ __m256h __DEFAULT_FN_ATTRS512
1956_mm512_cvtepi32_ph(__m512i __A) {
1957 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1958 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1959 _MM_FROUND_CUR_DIRECTION);
1960}
1961
1962static __inline__ __m256h __DEFAULT_FN_ATTRS512
1963_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1964 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1965 (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1966}
1967
1968static __inline__ __m256h __DEFAULT_FN_ATTRS512
1969_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
1970 return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
1971 (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
1972 _MM_FROUND_CUR_DIRECTION);
1973}
1974
1975#define _mm512_cvt_roundepu32_ph(A, R) \
1976 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
1977 (__v16hf)_mm256_undefined_ph(), \
1978 (__mmask16)(-1), (int)(R)))
1979
1980#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
1981 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
1982 (__mmask16)(U), (int)(R)))
1983
1984#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
1985 ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
1986 (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
1987
1988static __inline__ __m256h __DEFAULT_FN_ATTRS512
1989_mm512_cvtepu32_ph(__m512i __A) {
1990 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1991 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
1992 _MM_FROUND_CUR_DIRECTION);
1993}
1994
1995static __inline__ __m256h __DEFAULT_FN_ATTRS512
1996_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
1997 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
1998 (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
1999}
2000
2001static __inline__ __m256h __DEFAULT_FN_ATTRS512
2002_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
2003 return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
2004 (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2005 _MM_FROUND_CUR_DIRECTION);
2006}
2007
2008#define _mm512_cvtt_roundph_epi32(A, R) \
2009 ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
2010 (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
2011 (int)(R)))
2012
2013#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
2014 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
2015 (__mmask16)(U), (int)(R)))
2016
2017#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
2018 ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
2019 (__v16si)_mm512_setzero_epi32(), \
2020 (__mmask16)(U), (int)(R)))
2021
2022static __inline__ __m512i __DEFAULT_FN_ATTRS512
2023_mm512_cvttph_epi32(__m256h __A) {
2024 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2025 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
2026 _MM_FROUND_CUR_DIRECTION);
2027}
2028
2029static __inline__ __m512i __DEFAULT_FN_ATTRS512
2030_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
2031 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2032 (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2033}
2034
2035static __inline__ __m512i __DEFAULT_FN_ATTRS512
2036_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
2037 return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
2038 (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
2039 _MM_FROUND_CUR_DIRECTION);
2040}
2041
2042#define _mm512_cvtt_roundph_epu32(A, R) \
2043 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2044 (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
2045 (int)(R)))
2046
2047#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
2048 ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
2049 (__mmask16)(U), (int)(R)))
2050
2051#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
2052 ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
2053 (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
2054 (int)(R)))
2055
2056static __inline__ __m512i __DEFAULT_FN_ATTRS512
2057_mm512_cvttph_epu32(__m256h __A) {
2058 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2059 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
2060 _MM_FROUND_CUR_DIRECTION);
2061}
2062
2063static __inline__ __m512i __DEFAULT_FN_ATTRS512
2064_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
2065 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2066 (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2067}
2068
2069static __inline__ __m512i __DEFAULT_FN_ATTRS512
2070_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
2071 return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
2072 (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
2073 _MM_FROUND_CUR_DIRECTION);
2074}
2075
2076#define _mm512_cvt_roundepi64_ph(A, R) \
2077 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2078 (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2079
2080#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
2081 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
2082 (__mmask8)(U), (int)(R)))
2083
2084#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
2085 ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
2086 (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2087
2088static __inline__ __m128h __DEFAULT_FN_ATTRS512
2089_mm512_cvtepi64_ph(__m512i __A) {
2090 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2091 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2092 _MM_FROUND_CUR_DIRECTION);
2093}
2094
2095static __inline__ __m128h __DEFAULT_FN_ATTRS512
2096_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2097 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2098 (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2099}
2100
2101static __inline__ __m128h __DEFAULT_FN_ATTRS512
2102_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
2103 return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
2104 (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2105 _MM_FROUND_CUR_DIRECTION);
2106}
2107
2108#define _mm512_cvt_roundph_epi64(A, R) \
2109 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
2110 (__v8di)_mm512_undefined_epi32(), \
2111 (__mmask8)(-1), (int)(R)))
2112
2113#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
2114 ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2115 (__mmask8)(U), (int)(R)))
2116
2117#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
2118 ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
2119 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2120
2121static __inline__ __m512i __DEFAULT_FN_ATTRS512
2122_mm512_cvtph_epi64(__m128h __A) {
2123 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2124 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2125 _MM_FROUND_CUR_DIRECTION);
2126}
2127
2128static __inline__ __m512i __DEFAULT_FN_ATTRS512
2129_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2130 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2131 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2132}
2133
2134static __inline__ __m512i __DEFAULT_FN_ATTRS512
2135_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
2136 return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
2137 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2138 _MM_FROUND_CUR_DIRECTION);
2139}
2140
2141#define _mm512_cvt_roundepu64_ph(A, R) \
2142 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2143 (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
2144
2145#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
2146 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
2147 (__mmask8)(U), (int)(R)))
2148
2149#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
2150 ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
2151 (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
2152
2153static __inline__ __m128h __DEFAULT_FN_ATTRS512
2154_mm512_cvtepu64_ph(__m512i __A) {
2155 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2156 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
2157 _MM_FROUND_CUR_DIRECTION);
2158}
2159
2160static __inline__ __m128h __DEFAULT_FN_ATTRS512
2161_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
2162 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2163 (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2164}
2165
2166static __inline__ __m128h __DEFAULT_FN_ATTRS512
2167_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
2168 return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
2169 (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
2170 _MM_FROUND_CUR_DIRECTION);
2171}
2172
2173#define _mm512_cvt_roundph_epu64(A, R) \
2174 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2175 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2176 (int)(R)))
2177
2178#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
2179 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2180 (__mmask8)(U), (int)(R)))
2181
2182#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
2183 ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
2184 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2185
2186static __inline__ __m512i __DEFAULT_FN_ATTRS512
2187_mm512_cvtph_epu64(__m128h __A) {
2188 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2189 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2190 _MM_FROUND_CUR_DIRECTION);
2191}
2192
2193static __inline__ __m512i __DEFAULT_FN_ATTRS512
2194_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2195 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2196 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2197}
2198
2199static __inline__ __m512i __DEFAULT_FN_ATTRS512
2200_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
2201 return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
2202 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2203 _MM_FROUND_CUR_DIRECTION);
2204}
2205
2206#define _mm512_cvtt_roundph_epi64(A, R) \
2207 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2208 (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
2209 (int)(R)))
2210
2211#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
2212 ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
2213 (__mmask8)(U), (int)(R)))
2214
2215#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
2216 ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
2217 (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2218
2219static __inline__ __m512i __DEFAULT_FN_ATTRS512
2220_mm512_cvttph_epi64(__m128h __A) {
2221 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2222 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
2223 _MM_FROUND_CUR_DIRECTION);
2224}
2225
2226static __inline__ __m512i __DEFAULT_FN_ATTRS512
2227_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
2228 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2229 (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2230}
2231
2232static __inline__ __m512i __DEFAULT_FN_ATTRS512
2233_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
2234 return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
2235 (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
2236 _MM_FROUND_CUR_DIRECTION);
2237}
2238
2239#define _mm512_cvtt_roundph_epu64(A, R) \
2240 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2241 (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
2242 (int)(R)))
2243
2244#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
2245 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
2246 (__mmask8)(U), (int)(R)))
2247
2248#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
2249 ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
2250 (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
2251
2252static __inline__ __m512i __DEFAULT_FN_ATTRS512
2253_mm512_cvttph_epu64(__m128h __A) {
2254 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2255 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
2256 _MM_FROUND_CUR_DIRECTION);
2257}
2258
2259static __inline__ __m512i __DEFAULT_FN_ATTRS512
2260_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
2261 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2262 (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2263}
2264
2265static __inline__ __m512i __DEFAULT_FN_ATTRS512
2266_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
2267 return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
2268 (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
2269 _MM_FROUND_CUR_DIRECTION);
2270}
2271
2272#define _mm_cvt_roundsh_i32(A, R) \
2273 ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
2274
2275static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
2276 return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2277}
2278
2279#define _mm_cvt_roundsh_u32(A, R) \
2280 ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
2281
2282static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2283_mm_cvtsh_u32(__m128h __A) {
2284 return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
2285 _MM_FROUND_CUR_DIRECTION);
2286}
2287
2288#ifdef __x86_64__
2289#define _mm_cvt_roundsh_i64(A, R) \
2290 ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
2291
2292static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
2293 return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
2294 _MM_FROUND_CUR_DIRECTION);
2295}
2296
2297#define _mm_cvt_roundsh_u64(A, R) \
2298 ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
2299
2300static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2301_mm_cvtsh_u64(__m128h __A) {
2302 return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
2303 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2304}
2305#endif // __x86_64__
2306
2307#define _mm_cvt_roundu32_sh(A, B, R) \
2308 ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
2309
2310static __inline__ __m128h __DEFAULT_FN_ATTRS128
2311_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
2312 __A[0] = __B;
2313 return __A;
2314}
2315
2316#ifdef __x86_64__
2317#define _mm_cvt_roundu64_sh(A, B, R) \
2318 ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
2319 (int)(R)))
2320
2321static __inline__ __m128h __DEFAULT_FN_ATTRS128
2322_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
2323 __A[0] = __B;
2324 return __A;
2325}
2326#endif
2327
2328#define _mm_cvt_roundi32_sh(A, B, R) \
2329 ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
2330
2331static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
2332 int __B) {
2333 __A[0] = __B;
2334 return __A;
2335}
2336
2337#ifdef __x86_64__
2338#define _mm_cvt_roundi64_sh(A, B, R) \
2339 ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
2340
2341static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
2342 long long __B) {
2343 __A[0] = __B;
2344 return __A;
2345}
2346#endif
2347
2348#define _mm_cvtt_roundsh_i32(A, R) \
2349 ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
2350
2351static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
2352 return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
2353 _MM_FROUND_CUR_DIRECTION);
2354}
2355
2356#ifdef __x86_64__
2357#define _mm_cvtt_roundsh_i64(A, R) \
2358 ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
2359
2360static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
2361 return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
2362 _MM_FROUND_CUR_DIRECTION);
2363}
2364#endif
2365
2366#define _mm_cvtt_roundsh_u32(A, R) \
2367 ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
2368
2369static __inline__ unsigned int __DEFAULT_FN_ATTRS128
2370_mm_cvttsh_u32(__m128h __A) {
2371 return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
2372 _MM_FROUND_CUR_DIRECTION);
2373}
2374
2375#ifdef __x86_64__
2376#define _mm_cvtt_roundsh_u64(A, R) \
2377 ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
2378
2379static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
2380_mm_cvttsh_u64(__m128h __A) {
2381 return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
2382 (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
2383}
2384#endif
2385
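/* Usage sketch: the cvtsh variants honor the current MXCSR rounding mode,
 * while the cvttsh variants always truncate toward zero. */
static inline void __half_to_int_example(__m128h __v, int *__rounded,
                                         int *__truncated) {
  *__rounded = _mm_cvtsh_i32(__v);
  *__truncated = _mm_cvttsh_i32(__v);
}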
2386#define _mm512_cvtx_roundph_ps(A, R) \
2387 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
2388 (__v16sf)_mm512_undefined_ps(), \
2389 (__mmask16)(-1), (int)(R)))
2390
2391#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
2392 ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
2393 (__mmask16)(U), (int)(R)))
2394
2395#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
2396 ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
2397 (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
2398
2399static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
2400 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2401 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
2402 _MM_FROUND_CUR_DIRECTION);
2403}
2404
2405static __inline__ __m512 __DEFAULT_FN_ATTRS512
2406_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
2407 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2408 (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2409}
2410
2411static __inline__ __m512 __DEFAULT_FN_ATTRS512
2412_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
2413 return (__m512)__builtin_ia32_vcvtph2psx512_mask(
2414 (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
2415 _MM_FROUND_CUR_DIRECTION);
2416}
2417
2418#define _mm512_cvtx_roundps_ph(A, R) \
2419 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
2420 (__v16hf)_mm256_undefined_ph(), \
2421 (__mmask16)(-1), (int)(R)))
2422
2423#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
2424 ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
2425 (__mmask16)(U), (int)(R)))
2426
2427#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
2428 ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
2429 (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
2430
2431static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
2432 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2433 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
2434 _MM_FROUND_CUR_DIRECTION);
2435}
2436
2437static __inline__ __m256h __DEFAULT_FN_ATTRS512
2438_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
2439 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2440 (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
2441}
2442
2443static __inline__ __m256h __DEFAULT_FN_ATTRS512
2444_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
2445 return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
2446 (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
2447 _MM_FROUND_CUR_DIRECTION);
2448}
2449
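/* Usage sketch: composing the two cvtx conversions above quantizes 16 floats
 * through _Float16 and back. */
static inline __m512 __fp16_quantize_ps_example(__m512 __x) {
  return _mm512_cvtxph_ps(_mm512_cvtxps_ph(__x));
}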
2450#define _mm512_fmadd_round_ph(A, B, C, R) \
2451 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2452 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2453 (__mmask32)-1, (int)(R)))
2454
2455#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
2456 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2457 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2458 (__mmask32)(U), (int)(R)))
2459
2460#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
2461 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2462 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2463 (__mmask32)(U), (int)(R)))
2464
2465#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
2466 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2467 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2468 (__mmask32)(U), (int)(R)))
2469
2470#define _mm512_fmsub_round_ph(A, B, C, R) \
2471 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2472 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2473 (__mmask32)-1, (int)(R)))
2474
2475#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
2476 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2477 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2478 (__mmask32)(U), (int)(R)))
2479
2480#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
2481 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2482 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2483 (__mmask32)(U), (int)(R)))
2484
2485#define _mm512_fnmadd_round_ph(A, B, C, R) \
2486 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2487 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2488 (__mmask32)-1, (int)(R)))
2489
2490#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
2491 ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
2492 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2493 (__mmask32)(U), (int)(R)))
2494
2495#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
2496 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2497 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2498 (__mmask32)(U), (int)(R)))
2499
2500#define _mm512_fnmsub_round_ph(A, B, C, R) \
2501 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2502 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2503 (__mmask32)-1, (int)(R)))
2504
2505#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
2506 ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
2507 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2508 (__mmask32)(U), (int)(R)))
2509
2510static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2511 __m512h __B,
2512 __m512h __C) {
2513 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2514 (__v32hf)__C, (__mmask32)-1,
2515 _MM_FROUND_CUR_DIRECTION);
2516}
2517
2518static __inline__ __m512h __DEFAULT_FN_ATTRS512
2519_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2520 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2521 (__v32hf)__C, (__mmask32)__U,
2522 _MM_FROUND_CUR_DIRECTION);
2523}
2524
2525static __inline__ __m512h __DEFAULT_FN_ATTRS512
2526_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2527 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2528 (__v32hf)__C, (__mmask32)__U,
2529 _MM_FROUND_CUR_DIRECTION);
2530}
2531
2532static __inline__ __m512h __DEFAULT_FN_ATTRS512
2533_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2534 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2535 (__v32hf)__C, (__mmask32)__U,
2536 _MM_FROUND_CUR_DIRECTION);
2537}
2538
2539static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2540 __m512h __B,
2541 __m512h __C) {
2542 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2543 -(__v32hf)__C, (__mmask32)-1,
2544 _MM_FROUND_CUR_DIRECTION);
2545}
2546
2547static __inline__ __m512h __DEFAULT_FN_ATTRS512
2548_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2549 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2550 -(__v32hf)__C, (__mmask32)__U,
2551 _MM_FROUND_CUR_DIRECTION);
2552}
2553
2554static __inline__ __m512h __DEFAULT_FN_ATTRS512
2555_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2556 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2557 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2558 _MM_FROUND_CUR_DIRECTION);
2559}
2560
2561static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2562 __m512h __B,
2563 __m512h __C) {
2564 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2565 (__v32hf)__C, (__mmask32)-1,
2566 _MM_FROUND_CUR_DIRECTION);
2567}
2568
2569static __inline__ __m512h __DEFAULT_FN_ATTRS512
2570_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2571 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2572 (__v32hf)__C, (__mmask32)__U,
2573 _MM_FROUND_CUR_DIRECTION);
2574}
2575
2576static __inline__ __m512h __DEFAULT_FN_ATTRS512
2577_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2578 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2579 (__v32hf)__C, (__mmask32)__U,
2580 _MM_FROUND_CUR_DIRECTION);
2581}
2582
2583static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2584 __m512h __B,
2585 __m512h __C) {
2586 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2587 -(__v32hf)__C, (__mmask32)-1,
2588 _MM_FROUND_CUR_DIRECTION);
2589}
2590
2591static __inline__ __m512h __DEFAULT_FN_ATTRS512
2592_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2593 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2594 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2595 _MM_FROUND_CUR_DIRECTION);
2596}
2597
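/* Usage sketch: in the mask form the first operand also supplies the
 * passthrough lanes, so lane i of the result is __a*__x + __y where the mask
 * bit is set and __a otherwise; the maskz form zeroes those lanes instead. */
static inline __m512h __masked_fma_example(__m512h __a, __mmask32 __m,
                                           __m512h __x, __m512h __y) {
  return _mm512_mask_fmadd_ph(__a, __m, __x, __y);
}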
2598#define _mm512_fmaddsub_round_ph(A, B, C, R) \
2599 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2600 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2601 (__mmask32)-1, (int)(R)))
2602
2603#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
2604 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2605 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2606 (__mmask32)(U), (int)(R)))
2607
2608#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
2609 ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
2610 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2611 (__mmask32)(U), (int)(R)))
2612
2613#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
2614 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2615 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2616 (__mmask32)(U), (int)(R)))
2617
2618#define _mm512_fmsubadd_round_ph(A, B, C, R) \
2619 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2620 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2621 (__mmask32)-1, (int)(R)))
2622
2623#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
2624 ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
2625 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2626 (__mmask32)(U), (int)(R)))
2627
2628#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
2629 ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
2630 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2631 (__mmask32)(U), (int)(R)))
2632
2633static __inline__ __m512h __DEFAULT_FN_ATTRS512
2634_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
2635 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2636 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
2637 _MM_FROUND_CUR_DIRECTION);
2638}
2639
2640static __inline__ __m512h __DEFAULT_FN_ATTRS512
2641_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2642 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2643 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2644 _MM_FROUND_CUR_DIRECTION);
2645}
2646
2647static __inline__ __m512h __DEFAULT_FN_ATTRS512
2648_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2649 return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
2650 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2651 _MM_FROUND_CUR_DIRECTION);
2652}
2653
2654static __inline__ __m512h __DEFAULT_FN_ATTRS512
2655_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2656 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2657 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2658 _MM_FROUND_CUR_DIRECTION);
2659}
2660
2661static __inline__ __m512h __DEFAULT_FN_ATTRS512
2662_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
2663 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2664 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
2665 _MM_FROUND_CUR_DIRECTION);
2666}
2667
2668static __inline__ __m512h __DEFAULT_FN_ATTRS512
2669_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2670 return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
2671 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2672 _MM_FROUND_CUR_DIRECTION);
2673}
2674
2675static __inline__ __m512h __DEFAULT_FN_ATTRS512
2676_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2677 return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
2678 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2679 _MM_FROUND_CUR_DIRECTION);
2680}
2681
2682#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
2683 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2684 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2685 (__mmask32)(U), (int)(R)))
2686
2687static __inline__ __m512h __DEFAULT_FN_ATTRS512
2688_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2689 return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
2690 (__v32hf)__C, (__mmask32)__U,
2691 _MM_FROUND_CUR_DIRECTION);
2692}
2693
2694#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
2695 ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
2696 (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2697 (__mmask32)(U), (int)(R)))
2698
2699static __inline__ __m512h __DEFAULT_FN_ATTRS512
2700_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2701 return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
2702 (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
2703 _MM_FROUND_CUR_DIRECTION);
2704}
2705
2706#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
2707 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2708 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2709 (__mmask32)(U), (int)(R)))
2710
2711static __inline__ __m512h __DEFAULT_FN_ATTRS512
2712_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2713 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2714 (__v32hf)__C, (__mmask32)__U,
2715 _MM_FROUND_CUR_DIRECTION);
2716}
2717
2718#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
2719 ((__m512h)__builtin_ia32_vfmaddph512_mask( \
2720 (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
2721 (__mmask32)(U), (int)(R)))
2722
2723#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
2724 ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
2725 -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
2726 (__mmask32)(U), (int)(R)))
2727
2728static __inline__ __m512h __DEFAULT_FN_ATTRS512
2729_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2730 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2731 -(__v32hf)__C, (__mmask32)__U,
2732 _MM_FROUND_CUR_DIRECTION);
2733}
2734
2735static __inline__ __m512h __DEFAULT_FN_ATTRS512
2736_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2737 return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2738 (__v32hf)__C, (__mmask32)__U,
2739 _MM_FROUND_CUR_DIRECTION);
2740}
2741
2742static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2743 __m128h __A,
2744 __m128h __B) {
2745 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2746 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2747}
2748
2749static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2750 __mmask8 __U,
2751 __m128h __A,
2752 __m128h __B) {
2753 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2754 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2755}
2756
2757#define _mm_fmadd_round_sh(A, B, C, R) \
2758 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2759 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2760 (__mmask8)-1, (int)(R)))
2761
2762#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2763 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2764 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2765 (__mmask8)(U), (int)(R)))
2766
2767static __inline__ __m128h __DEFAULT_FN_ATTRS128
2768_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2769 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2770 (__mmask8)__U,
2771 _MM_FROUND_CUR_DIRECTION);
2772}
2773
2774#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2775 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2776 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2777 (__mmask8)(U), (int)(R)))
2778
2779static __inline__ __m128h __DEFAULT_FN_ATTRS128
2780_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2781 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2782 (__mmask8)__U,
2783 _MM_FROUND_CUR_DIRECTION);
2784}
2785
2786#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2787 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2788 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2789 (__mmask8)(U), (int)(R)))
2790
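/* Usage sketch: the scalar form computes __w*__a + __b in element 0 and
 * copies the remaining elements from __w, so a plain half FMA looks like: */
static inline _Float16 __fma_half_example(_Float16 __a, _Float16 __b,
                                          _Float16 __c) {
  return _mm_cvtsh_h(
      _mm_fmadd_sh(_mm_set_sh(__a), _mm_set_sh(__b), _mm_set_sh(__c)));
}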
2791static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2792 __m128h __A,
2793 __m128h __B) {
2794 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2795 -(__v8hf)__B, (__mmask8)-1,
2796 _MM_FROUND_CUR_DIRECTION);
2797}
2798
2799static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2800 __mmask8 __U,
2801 __m128h __A,
2802 __m128h __B) {
2803 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2804 -(__v8hf)__B, (__mmask8)__U,
2805 _MM_FROUND_CUR_DIRECTION);
2806}
2807
2808#define _mm_fmsub_round_sh(A, B, C, R) \
2809 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2810 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2811 (__mmask8)-1, (int)(R)))
2812
2813#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2814 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2815 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2816 (__mmask8)(U), (int)(R)))
2817
2818static __inline__ __m128h __DEFAULT_FN_ATTRS128
2819_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2820 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2821 -(__v8hf)__C, (__mmask8)__U,
2822 _MM_FROUND_CUR_DIRECTION);
2823}
2824
2825#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
2826 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2827 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2828 (__mmask8)(U), (int)(R)))
2829
2830static __inline__ __m128h __DEFAULT_FN_ATTRS128
2831_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2832 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2833 (__mmask8)__U,
2834 _MM_FROUND_CUR_DIRECTION);
2835}
2836
2837#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
2838 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2839 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2840 (__mmask8)(U), (int)(R)))
2841
2842static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2843 __m128h __A,
2844 __m128h __B) {
2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2846 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2847}
2848
2849static __inline__ __m128h __DEFAULT_FN_ATTRS128
2850_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2851 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2852 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2853}
2854
2855#define _mm_fnmadd_round_sh(A, B, C, R) \
2856 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2857 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2858 (__mmask8)-1, (int)(R)))
2859
2860#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2861 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2862 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2863 (__mmask8)(U), (int)(R)))
2864
2865static __inline__ __m128h __DEFAULT_FN_ATTRS128
2866_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2867 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2868 (__mmask8)__U,
2869 _MM_FROUND_CUR_DIRECTION);
2870}
2871
2872#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2873 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2874 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2875 (__mmask8)(U), (int)(R)))
2876
2877static __inline__ __m128h __DEFAULT_FN_ATTRS128
2878_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2879 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2880 (__mmask8)__U,
2881 _MM_FROUND_CUR_DIRECTION);
2882}
2883
2884#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2885 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2886 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2887 (__mmask8)(U), (int)(R)))
2888
2889static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2890 __m128h __A,
2891 __m128h __B) {
2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2893 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2894}
2895
2896static __inline__ __m128h __DEFAULT_FN_ATTRS128
2897_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2898 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2899 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2900}
2901
2902#define _mm_fnmsub_round_sh(A, B, C, R) \
2903 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2904 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2905 (__mmask8)-1, (int)(R)))
2906
2907#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2908 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2909 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2910 (__mmask8)(U), (int)(R)))
2911
2912static __inline__ __m128h __DEFAULT_FN_ATTRS128
2913_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2914 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2915 (__mmask8)__U,
2916 _MM_FROUND_CUR_DIRECTION);
2917}
2918
2919#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2920 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2921 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2922 (__mmask8)(U), (int)(R)))
2923
2924static __inline__ __m128h __DEFAULT_FN_ATTRS128
2925_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2926 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2927 (__mmask8)__U,
2928 _MM_FROUND_CUR_DIRECTION);
2929}
2930
2931#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2932 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2933 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2934 (__mmask8)(U), (int)(R)))
2935
2936static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2937 __m128h __B,
2938 __m128h __C) {
2939 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2940 (__v4sf)__C, (__mmask8)-1,
2941 _MM_FROUND_CUR_DIRECTION);
2942}
2943
2944static __inline__ __m128h __DEFAULT_FN_ATTRS128
2945_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2946 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2947 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2948}
2949
2950static __inline__ __m128h __DEFAULT_FN_ATTRS128
2951_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2952 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2953 (__v4sf)__C, (__mmask8)__U,
2954 _MM_FROUND_CUR_DIRECTION);
2955}
2956
2957static __inline__ __m128h __DEFAULT_FN_ATTRS128
2958_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2959 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2960 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2961}
2962
2963#define _mm_fcmadd_round_sch(A, B, C, R) \
2964 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2965 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2966 (__mmask8)-1, (int)(R)))
2967
2968#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2969 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2970 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2971 (__mmask8)(U), (int)(R)))
2972
2973#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2974 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2975 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2976 (__mmask8)(U), (int)(R)))
2977
2978#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2979 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2980 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2981 (__mmask8)(U), (int)(R)))
2982
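/* Usage sketch: the sch intrinsics treat the low 32 bits as one complex
 * _Float16 value (element 0 real, element 1 imaginary); the fcmadd forms
 * conjugate one multiplicand (per the ISA, an assumption not spelled out in
 * this file), while fmadd_sch below multiplies without conjugation. */
static inline __m128h __complex_fma_example(__m128h __a, __m128h __b,
                                            __m128h __c) {
  return _mm_fcmadd_sch(__a, __b, __c);
}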
2983static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2984 __m128h __B,
2985 __m128h __C) {
2986 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2987 (__v4sf)__C, (__mmask8)-1,
2988 _MM_FROUND_CUR_DIRECTION);
2989}
2990
2991static __inline__ __m128h __DEFAULT_FN_ATTRS128
2992_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2993 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2994 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2995}
2996
2997static __inline__ __m128h __DEFAULT_FN_ATTRS128
2998_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2999 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
3000 (__v4sf)__C, (__mmask8)__U,
3001 _MM_FROUND_CUR_DIRECTION);
3002}
3003
3004static __inline__ __m128h __DEFAULT_FN_ATTRS128
3005_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3006 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3007 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3008}
3009
3010#define _mm_fmadd_round_sch(A, B, C, R) \
3011 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3012 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3013 (__mmask8)-1, (int)(R)))
3014
3015#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3016 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3017 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3018 (__mmask8)(U), (int)(R)))
3019
3020#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3021 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3022 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3023 (__mmask8)(U), (int)(R)))
3024
3025#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3026 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3027 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3028 (__mmask8)(U), (int)(R)))
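
/* Editorial note: _mm_fmadd_sch is the non-conjugating counterpart of
 * _mm_fcmadd_sch above; with the same a and b it would produce
 * (1+2i)*(3+4i) + 0 = -5 + 10i in the low complex pair. */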
3029
3030static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
3031 __m128h __B) {
3032 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3033 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3034 _MM_FROUND_CUR_DIRECTION);
3035}
3036
3037static __inline__ __m128h __DEFAULT_FN_ATTRS128
3038_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
3039 return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3040 (__v4sf)__W, (__mmask8)__U,
3041 _MM_FROUND_CUR_DIRECTION);
3042}
3043
3044static __inline__ __m128h __DEFAULT_FN_ATTRS128
3045_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3046 return (__m128h)__builtin_ia32_vfcmulcsh_mask(
3047 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3048 _MM_FROUND_CUR_DIRECTION);
3049}
3050
3051#define _mm_fcmul_round_sch(A, B, R) \
3052 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3053 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3054 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3055
3056#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
3057 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3058 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3059 (__mmask8)(U), (int)(R)))
3060
3061#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
3062 ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
3063 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3064 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3065
3066static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3067 __m128h __B) {
3068 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3069 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3070 _MM_FROUND_CUR_DIRECTION);
3071}
3072
3073static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3074 __mmask8 __U,
3075 __m128h __A,
3076 __m128h __B) {
3077 return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3078 (__v4sf)__W, (__mmask8)__U,
3079 _MM_FROUND_CUR_DIRECTION);
3080}
3081
3082static __inline__ __m128h __DEFAULT_FN_ATTRS128
3083_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
3084 return (__m128h)__builtin_ia32_vfmulcsh_mask(
3085 (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
3086 _MM_FROUND_CUR_DIRECTION);
3087}
3088
3089#define _mm_fmul_round_sch(A, B, R) \
3090 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3091 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3092 (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
3093
3094#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
3095 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3096 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
3097 (__mmask8)(U), (int)(R)))
3098
3099#define _mm_maskz_fmul_round_sch(U, A, B, R) \
3100 ((__m128h)__builtin_ia32_vfmulcsh_mask( \
3101 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
3102 (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
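
/* Editorial sketch (not part of the upstream header): _mm_fmul_sch and
 * _mm_fcmul_sch multiply only the low complex pair; per the documented
 * VFMULCSH/VFCMULCSH behavior, the upper six FP16 elements of the result
 * are copied from the first operand. */
#if 0 /* illustration only */
__m128h a = _mm_setr_ph((_Float16)1.0, (_Float16)2.0, 0, 0, 0, 0, 0, 0); // 1+2i
__m128h b = _mm_setr_ph((_Float16)3.0, (_Float16)4.0, 0, 0, 0, 0, 0, 0); // 3+4i
__m128h p  = _mm_fmul_sch(a, b);  // low pair: (1+2i)(3+4i) = -5 + 10i
__m128h pc = _mm_fcmul_sch(a, b); // low pair: (1+2i)(3-4i) = 11 +  2i
#endif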
3103
3104static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
3105 __m512h __B) {
3106 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3107 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3108 _MM_FROUND_CUR_DIRECTION);
3109}
3110
3111static __inline__ __m512h __DEFAULT_FN_ATTRS512
3112_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3113 return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3114 (__v16sf)__W, (__mmask16)__U,
3115 _MM_FROUND_CUR_DIRECTION);
3116}
3117
3118static __inline__ __m512h __DEFAULT_FN_ATTRS512
3119_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3120 return (__m512h)__builtin_ia32_vfcmulcph512_mask(
3121 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3122 _MM_FROUND_CUR_DIRECTION);
3123}
3124
3125#define _mm512_fcmul_round_pch(A, B, R) \
3126 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3127 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3128 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3129
3130#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
3131 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3132 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3133 (__mmask16)(U), (int)(R)))
3134
3135#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
3136 ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
3137 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3138 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3139
3140static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
3141 __m512h __B) {
3142 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3143 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
3144 _MM_FROUND_CUR_DIRECTION);
3145}
3146
3147static __inline__ __m512h __DEFAULT_FN_ATTRS512
3148_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
3149 return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
3150 (__v16sf)__W, (__mmask16)__U,
3151 _MM_FROUND_CUR_DIRECTION);
3152}
3153
3154static __inline__ __m512h __DEFAULT_FN_ATTRS512
3155_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
3156 return (__m512h)__builtin_ia32_vfmulcph512_mask(
3157 (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
3158 _MM_FROUND_CUR_DIRECTION);
3159}
3160
3161#define _mm512_fmul_round_pch(A, B, R) \
3162 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3163 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3164 (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
3165
3166#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
3167 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3168 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
3169 (__mmask16)(U), (int)(R)))
3170
3171#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
3172 ((__m512h)__builtin_ia32_vfmulcph512_mask( \
3173 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
3174 (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
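
/* Editorial sketch (not part of the upstream header): the _pch intrinsics
 * process 16 complex FP16 values per __m512h, and the __mmask16 selects
 * whole complex numbers (one bit per real/imag pair), not individual 16-bit
 * lanes. The constant below assumes 0x3C00 is the FP16 encoding of 1.0, so
 * each 32-bit chunk holds the pair (0.0, 1.0) = 0+1i; the helper name is
 * hypothetical. */
#if 0 /* illustration only */
static __m512h demo_rot90(__m512h __v, __m512h __src, __mmask16 __k) {
  // Multiply each complex value by i: (re, im) -> (-im, re). Pairs whose
  // mask bit is clear are taken from __src instead.
  __m512h i_unit = _mm512_castsi512_ph(_mm512_set1_epi32(0x3C000000));
  return _mm512_mask_fmul_pch(__src, __k, __v, i_unit);
}
#endif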
3175
3176static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3177 __m512h __B,
3178 __m512h __C) {
3179 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3180 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3181 _MM_FROUND_CUR_DIRECTION);
3182}
3183
3184static __inline__ __m512h __DEFAULT_FN_ATTRS512
3185_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3186 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3187 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3188 _MM_FROUND_CUR_DIRECTION);
3189}
3190
3191static __inline__ __m512h __DEFAULT_FN_ATTRS512
3192_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3193 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3194 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3195 _MM_FROUND_CUR_DIRECTION);
3196}
3197
3198static __inline__ __m512h __DEFAULT_FN_ATTRS512
3199_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3200 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3201 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3202 _MM_FROUND_CUR_DIRECTION);
3203}
3204
3205#define _mm512_fcmadd_round_pch(A, B, C, R) \
3206 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3207 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3208 (__mmask16)-1, (int)(R)))
3209
3210#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3211 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3212 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3213 (__mmask16)(U), (int)(R)))
3214
3215#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3216 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3217 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3218 (__mmask16)(U), (int)(R)))
3219
3220#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3221 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3222 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3223 (__mmask16)(U), (int)(R)))
3224
3225static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3226 __m512h __B,
3227 __m512h __C) {
3228 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3229 (__v16sf)__C, (__mmask16)-1,
3230 _MM_FROUND_CUR_DIRECTION);
3231}
3232
3233static __inline__ __m512h __DEFAULT_FN_ATTRS512
3234_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3235 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3236 (__v16sf)__C, (__mmask16)__U,
3237 _MM_FROUND_CUR_DIRECTION);
3238}
3239
3240static __inline__ __m512h __DEFAULT_FN_ATTRS512
3241_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3242 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3243 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3244 _MM_FROUND_CUR_DIRECTION);
3245}
3246
3247static __inline__ __m512h __DEFAULT_FN_ATTRS512
3248_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3249 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3250 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3251 _MM_FROUND_CUR_DIRECTION);
3252}
3253
3254#define _mm512_fmadd_round_pch(A, B, C, R) \
3255 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3256 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3257 (__mmask16)-1, (int)(R)))
3258
3259#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3260 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3261 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3262 (__mmask16)(U), (int)(R)))
3263
3264#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3265 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3266 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3267 (__mmask16)(U), (int)(R)))
3268
3269#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3270 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3271 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3272 (__mmask16)(U), (int)(R)))
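
/* Editorial sketch (not part of the upstream header): multiply-by-conjugate
 * plus accumulate is the building block of complex dot products, so
 * _mm512_fcmadd_pch can accumulate 16 partial complex sums per iteration.
 * The helper name is hypothetical. */
#if 0 /* illustration only */
static __m512h demo_cdot_partial(const __m512h *__a, const __m512h *__b,
                                 int __n) {
  __m512h __acc = _mm512_setzero_ph(); // 16 partial complex sums
  for (int __i = 0; __i < __n; ++__i)
    __acc = _mm512_fcmadd_pch(__a[__i], __b[__i], __acc); // a*conj(b) + acc
  return __acc; // combine the 16 pairs horizontally afterwards
}
#endif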
3273
3274static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3275_mm512_reduce_add_ph(__m512h __W) {
3276 return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
3277}
3278
3279static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3280_mm512_reduce_mul_ph(__m512h __W) {
3281 return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
3282}
3283
3284static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3285_mm512_reduce_max_ph(__m512h __V) {
3286 return __builtin_ia32_reduce_fmax_ph512(__V);
3287}
3288
3289static __inline__ _Float16 __DEFAULT_FN_ATTRS512
3290_mm512_reduce_min_ph(__m512h __V) {
3291 return __builtin_ia32_reduce_fmin_ph512(__V);
3292}
3293
3294static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3295_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
3296 return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
3297 (__v32hf)__A);
3298}
3299
3300static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3301_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
3302 return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
3303 (__v32hi)__B);
3304}
3305
3306static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
3307_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
3308 return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
3309}
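
/* Editorial sketch (not part of the upstream header): _mm512_mask_blend_ph
 * picks per 16-bit lane (bit set takes __W, clear takes __A; the mask is a
 * __mmask32, one bit per half), while _mm512_permutexvar_ph gathers lanes of
 * its second argument by the 16-bit indices in the first; only the low five
 * bits of each index matter for 32 lanes. */
#if 0 /* illustration only */
__m512h v = _mm512_set1_ph((_Float16)1.0), x = v, y = v;
// Broadcast lane 0 of v to all 32 lanes: every index is 0.
__m512h splat0 = _mm512_permutexvar_ph(_mm512_set1_epi16(0), v);
// Lanes 16..31 from y, lanes 0..15 from x.
__m512h mix = _mm512_mask_blend_ph((__mmask32)0xFFFF0000, x, y);
#endif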
3310
3311// intrinsics below are aliases for f*mul_*ch
3312#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
3313#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
3314#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
3315#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
3316#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
3317 _mm512_mask_fmul_round_pch(W, U, A, B, R)
3318#define _mm512_maskz_mul_round_pch(U, A, B, R) \
3319 _mm512_maskz_fmul_round_pch(U, A, B, R)
3320
3321#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
3322#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
3323#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
3324#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
3325#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
3326 _mm512_mask_fcmul_round_pch(W, U, A, B, R)
3327#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
3328 _mm512_maskz_fcmul_round_pch(U, A, B, R)
3329
3330#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
3331#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
3332#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
3333#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
3334#define _mm_mask_mul_round_sch(W, U, A, B, R) \
3335 _mm_mask_fmul_round_sch(W, U, A, B, R)
3336#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
3337
3338#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
3339#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
3340#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
3341#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
3342#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
3343 _mm_mask_fcmul_round_sch(W, U, A, B, R)
3344#define _mm_maskz_cmul_round_sch(U, A, B, R) \
3345 _mm_maskz_fcmul_round_sch(U, A, B, R)
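
/* Editorial note: the mul/cmul spellings above expand directly to their
 * fmul/fcmul counterparts, so e.g. _mm512_mul_pch(a, b) and
 * _mm512_fmul_pch(a, b) are interchangeable. */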
3346
3347#undef __DEFAULT_FN_ATTRS128
3348#undef __DEFAULT_FN_ATTRS256
3349#undef __DEFAULT_FN_ATTRS512
3350#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
3351#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
3352#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
3353
3354#endif
3355#endif