clang 22.0.0git
avx10_2convertintrin.h
Go to the documentation of this file.
1/*===--------------- avx10_2convertintrin.h - AVX10_2CONVERT ---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9#ifndef __IMMINTRIN_H
10#error \
11 "Never use <avx10_2convertintrin.h> directly; include <immintrin.h> instead."
12#endif // __IMMINTRIN_H
13
14#ifdef __SSE2__
15
16#ifndef __AVX10_2CONVERTINTRIN_H
17#define __AVX10_2CONVERTINTRIN_H
18
19/* Default attributes for the functions in this file: always-inline, nodebug, avx10.2 target, and a minimum vector width of 128 or 256 bits. */
20#define __DEFAULT_FN_ATTRS128 \
21 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
22 __min_vector_width__(128)))
23#define __DEFAULT_FN_ATTRS256 \
24 __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
25 __min_vector_width__(256)))
26
27// clang-format off
28
29/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
30/// single-precision (32-bit) floating-point elements to a 128-bit vector
31/// containing FP16 elements. No mask is applied: all 8 results are written.
32///
33/// \code{.operation}
34/// FOR i := 0 to 7
35/// IF i < 4
36/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
37/// ELSE
38/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
39/// FI
40///
41/// ENDFOR
42///
43/// dst[MAX:128] := 0
44/// \endcode
45///
46/// \headerfile <immintrin.h>
47///
48/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
49///
50/// \param __A
51/// A 128-bit vector of [4 x float].
52/// \param __B
53/// A 128-bit vector of [4 x float].
54/// \returns
55/// A 128-bit vector of [8 x fp16]. Lower 4 elements correspond to the
56/// (converted) elements from \a __B; upper 4 elements correspond to the
57/// (converted) elements from \a __A.
58static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A,
59 __m128 __B) {
60 return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
61 (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1));
62}
63
64/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
65/// single-precision (32-bit) floating-point elements to a 128-bit vector
66/// containing FP16 elements. Merging mask \a __U is used to determine if a
67/// given element should be taken from \a __W instead.
68///
69/// \code{.operation}
70/// FOR i := 0 to 7
71/// IF __U[i]
72/// IF i < 4
73/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
74/// ELSE
75/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
76/// FI
77/// ELSE
78/// dst.fp16[i] := __W.fp16[i]
79/// FI
80/// ENDFOR
81///
82/// dst[MAX:128] := 0
83/// \endcode
84///
85/// \headerfile <immintrin.h>
86///
87/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
88///
89/// \param __W
90/// A 128-bit vector of [8 x fp16].
91/// \param __U
92/// An 8-bit merging mask.
93/// \param __A
94/// A 128-bit vector of [4 x float].
95/// \param __B
96/// A 128-bit vector of [4 x float].
97/// \returns
98/// A 128-bit vector of [8 x fp16]. Lower 4 elements correspond to the
99/// (converted) elements from \a __B; upper 4 elements correspond to the
100/// (converted) elements from \a __A. If the corresponding mask bit is not set,
101/// then the element from \a __W is taken instead.
102static __inline__ __m128h __DEFAULT_FN_ATTRS128
103_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
104 return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
105 (__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U);
106}
107
108/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
109/// single-precision (32-bit) floating-point elements to a 128-bit vector
110/// containing FP16 elements. Zeroing mask \a __U is used to determine if a
111/// given element should be zeroed instead.
112///
113/// \code{.operation}
114/// FOR i := 0 to 7
115/// IF __U[i]
116/// IF i < 4
117/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
118/// ELSE
119/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
120/// FI
121/// ELSE
122/// dst.fp16[i] := 0
123/// FI
124/// ENDFOR
125///
126/// dst[MAX:128] := 0
127/// \endcode
128///
129/// \headerfile <immintrin.h>
130///
131/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
132///
133/// \param __U
134/// An 8-bit zeroing mask.
135/// \param __A
136/// A 128-bit vector of [4 x float].
137/// \param __B
138/// A 128-bit vector of [4 x float].
139/// \returns
140/// A 128-bit vector of [8 x fp16]. Lower 4 elements correspond to the
141/// (converted) elements from \a __B; upper 4 elements correspond to the
142/// (converted) elements from \a __A. If the corresponding mask bit is not set,
143/// then zero is taken instead.
144static __inline__ __m128h __DEFAULT_FN_ATTRS128
145_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
146 return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
147 (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
148}
149
150/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
151/// single-precision (32-bit) floating-point elements to a 256-bit vector
152/// containing FP16 elements. No mask is applied: all 16 results are written.
153///
154/// \code{.operation}
155/// FOR i := 0 to 15
156/// IF i < 8
157/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
158/// ELSE
159/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
160/// FI
161/// ENDFOR
162///
163/// dst[MAX:256] := 0
164/// \endcode
165///
166/// \headerfile <immintrin.h>
167///
168/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
169///
170/// \param __A
171/// A 256-bit vector of [8 x float].
172/// \param __B
173/// A 256-bit vector of [8 x float].
174/// \returns
175/// A 256-bit vector of [16 x fp16]. Lower 8 elements correspond to the
176/// (converted) elements from \a __B; upper 8 elements correspond to the
177/// (converted) elements from \a __A.
178static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
179 __m256 __B) {
180 return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
181 (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1));
182}
183
184/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
185/// single-precision (32-bit) floating-point elements to a 256-bit vector
186/// containing FP16 elements. Merging mask \a __U is used to determine if a
187/// given element should be taken from \a __W instead.
188///
189/// \code{.operation}
190/// FOR i := 0 to 15
191/// IF __U[i]
192/// IF i < 8
193/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
194/// ELSE
195/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
196/// FI
197/// ELSE
198/// dst.fp16[i] := __W.fp16[i]
199/// FI
200/// ENDFOR
201///
202/// dst[MAX:256] := 0
203/// \endcode
204///
205/// \headerfile <immintrin.h>
206///
207/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
208///
209/// \param __W
210/// A 256-bit vector of [16 x fp16].
211/// \param __U
212/// A 16-bit merging mask.
213/// \param __A
214/// A 256-bit vector of [8 x float].
215/// \param __B
216/// A 256-bit vector of [8 x float].
217/// \returns
218/// A 256-bit vector of [16 x fp16]. Lower 8 elements correspond to the
219/// (converted) elements from \a __B; upper 8 elements correspond to the
220/// (converted) elements from \a __A. If the corresponding mask bit is not set,
221/// then the element from \a __W is taken instead.
222static __inline__ __m256h __DEFAULT_FN_ATTRS256
223_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
224 return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
225 (__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U);
226}
227
228/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
229/// single-precision (32-bit) floating-point elements to a 256-bit vector
230/// containing FP16 elements. Zeroing mask \a __U is used to determine if a
231/// given element should be zeroed instead.
232///
233/// \code{.operation}
234/// FOR i := 0 to 15
235/// IF __U[i]
236/// IF i < 8
237/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
238/// ELSE
239/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
240/// FI
241/// ELSE
242/// dst.fp16[i] := 0
243/// FI
244/// ENDFOR
245///
246/// dst[MAX:256] := 0
247/// \endcode
248///
249/// \headerfile <immintrin.h>
250///
251/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
252///
253/// \param __U
254/// A 16-bit zeroing mask.
255/// \param __A
256/// A 256-bit vector of [8 x float].
257/// \param __B
258/// A 256-bit vector of [8 x float].
259/// \returns
260/// A 256-bit vector of [16 x fp16]. Lower 8 elements correspond to the
261/// (converted) elements from \a __B; upper 8 elements correspond to the
262/// (converted) elements from \a __A. If the corresponding mask bit is not set,
263/// then zero is taken instead.
264static __inline__ __m256h __DEFAULT_FN_ATTRS256
265_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
266 return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
267 (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
268}
269
270/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
271/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
272/// 16-bit integer stored in \a __A.
273///
274/// \code{.operation}
275/// FOR i := 0 to 7
276/// dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
277/// ENDFOR
278///
279/// dst[MAX:64] := 0
280/// \endcode
281///
282/// \headerfile <immintrin.h>
283///
284/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
285///
286/// \param __A
287/// A 128-bit vector of [8 x int16].
288/// \param __B
289/// A 128-bit vector of [8 x fp16].
290/// \returns
291/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
292/// converted elements from \a __B using biases from \a __A; higher order
293/// elements are zeroed.
294static __inline__ __m128i __DEFAULT_FN_ATTRS128
295_mm_cvtbiasph_bf8(__m128i __A, __m128h __B) {
296 return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
297 (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
298}
299
300/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
301/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
302/// 16-bit integer stored in \a __A. Merging mask \a __U is used to determine if
303/// a given element should be taken from \a __W instead.
304///
305/// \code{.operation}
306/// FOR i := 0 to 7
307/// IF __U[i]
308/// dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
309/// ELSE
310/// dst.bf8[i] := __W.bf8[i]
311/// FI
312/// ENDFOR
313///
314/// dst[MAX:64] := 0
315/// \endcode
316///
317/// \headerfile <immintrin.h>
318///
319/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
320///
321/// \param __W
322/// A 128-bit vector of [16 x bf8].
323/// \param __U
324/// An 8-bit merging mask.
325/// \param __A
326/// A 128-bit vector of [8 x int16].
327/// \param __B
328/// A 128-bit vector of [8 x fp16].
329/// \returns
330/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
331/// converted elements from \a __B, using biases from \a __A; higher order
332/// elements are zeroed. If the corresponding mask bit is not set, then the
333/// element from \a __W is taken instead.
334static __inline__ __m128i __DEFAULT_FN_ATTRS128
335_mm_mask_cvtbiasph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
336 return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
337 (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
338}
339
340/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
341/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
342/// 16-bit integer stored in \a __A. Zeroing mask \a __U is used to determine if
343/// a given element should be zeroed instead.
344///
345/// \code{.operation}
346/// FOR i := 0 to 7
347/// IF __U[i]
348/// dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
349/// ELSE
350/// dst.bf8[i] := 0
351/// FI
352/// ENDFOR
353///
354/// dst[MAX:64] := 0
355/// \endcode
356///
357/// \headerfile <immintrin.h>
358///
359/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
360///
361/// \param __U
362/// An 8-bit zeroing mask.
363/// \param __A
364/// A 128-bit vector of [8 x int16].
365/// \param __B
366/// A 128-bit vector of [8 x fp16].
367/// \returns
368/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
369/// converted elements from \a __B, using biases from \a __A; higher order
370/// elements are zeroed. If the corresponding mask bit is not set, then the
371/// element is zeroed.
372static __inline__ __m128i __DEFAULT_FN_ATTRS128
373_mm_maskz_cvtbiasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
374 return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
375 (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
376 (__mmask8)__U);
377}
378
379/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
380/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
381/// 16-bit integer stored in \a __A.
382///
383/// \code{.operation}
384/// FOR i := 0 to 15
385/// dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
386/// ENDFOR
387///
388/// dst[MAX:128] := 0
389/// \endcode
390///
391/// \headerfile <immintrin.h>
392///
393/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
394///
395/// \param __A
396/// A 256-bit vector of [16 x int16].
397/// \param __B
398/// A 256-bit vector of [16 x fp16].
399/// \returns
400/// A 128-bit vector of [16 x bf8]. Elements correspond to the
401/// converted elements from \a __B using biases from \a __A.
402static __inline__ __m128i __DEFAULT_FN_ATTRS256
403_mm256_cvtbiasph_bf8(__m256i __A, __m256h __B) {
404 return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
405 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
406 (__mmask16)-1);
407}
408
409/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
410/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
411/// 16-bit integer stored in \a __A. Merging mask \a __U is used to determine if
412/// a given element should be taken from \a __W instead.
413///
414/// \code{.operation}
415/// FOR i := 0 to 15
416/// IF __U[i]
417/// dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
418/// ELSE
419/// dst.bf8[i] := __W.bf8[i]
420/// FI
421/// ENDFOR
422///
423/// dst[MAX:128] := 0
424/// \endcode
425///
426/// \headerfile <immintrin.h>
427///
428/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
429///
430/// \param __W
431/// A 128-bit vector of [16 x bf8].
432/// \param __U
433/// A 16-bit merging mask.
434/// \param __A
435/// A 256-bit vector of [16 x int16].
436/// \param __B
437/// A 256-bit vector of [16 x fp16].
438/// \returns
439/// A 128-bit vector of [16 x bf8]. Elements correspond to the converted
440/// elements from \a __B, using biases from \a __A. If the corresponding mask
441/// bit is not set, then the element from \a __W is taken instead.
442static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_bf8(
443 __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
444 return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
445 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
446}
447
448/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
449/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
450/// 16-bit integer stored in \a __A. Zeroing mask \a __U is used to determine if
451/// a given element should be zeroed instead.
452///
453/// \code{.operation}
454/// FOR i := 0 to 15
455/// IF __U[i]
456/// dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
457/// ELSE
458/// dst.bf8[i] := 0
459/// FI
460/// ENDFOR
461///
462/// dst[MAX:128] := 0
463/// \endcode
464///
465/// \headerfile <immintrin.h>
466///
467/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
468///
469/// \param __U
470/// A 16-bit zeroing mask.
471/// \param __A
472/// A 256-bit vector of [16 x int16].
473/// \param __B
474/// A 256-bit vector of [16 x fp16].
475/// \returns
476/// A 128-bit vector of [16 x bf8]. Elements correspond to the converted
477/// elements from \a __B, using biases from \a __A. If the corresponding mask
478/// bit is not set, then the element is zeroed.
479static __inline__ __m128i __DEFAULT_FN_ATTRS256
480_mm256_maskz_cvtbiasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
481 return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
482 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
483 (__mmask16)__U);
484}
485
486/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
487/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
488/// 16-bit integer stored in \a __A. Results are saturated.
489///
490/// \code{.operation}
491/// FOR i := 0 to 7
492/// dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
493/// ENDFOR
494///
495/// dst[MAX:64] := 0
496/// \endcode
497///
498/// \headerfile <immintrin.h>
499///
500/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
501///
502/// \param __A
503/// A 128-bit vector of [8 x int16].
504/// \param __B
505/// A 128-bit vector of [8 x fp16].
506/// \returns
507/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
508/// converted elements from \a __B using biases from \a __A; higher order
509/// elements are zeroed.
510static __inline__ __m128i __DEFAULT_FN_ATTRS128
511_mm_cvts_biasph_bf8(__m128i __A, __m128h __B) {
512 return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
513 (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
514}
515
516/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
517/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
518/// 16-bit integer stored in \a __A. Results are saturated. Merging mask \a __U
519/// is used to determine if a given element should be taken from \a __W instead.
520///
521/// \code{.operation}
522/// FOR i := 0 to 7
523/// IF __U[i]
524/// dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
525/// ELSE
526/// dst.bf8[i] := __W.bf8[i]
527/// FI
528/// ENDFOR
529///
530/// dst[MAX:64] := 0
531/// \endcode
532///
533/// \headerfile <immintrin.h>
534///
535/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
536///
537/// \param __W
538/// A 128-bit vector of [16 x bf8].
539/// \param __U
540/// An 8-bit merging mask.
541/// \param __A
542/// A 128-bit vector of [8 x int16].
543/// \param __B
544/// A 128-bit vector of [8 x fp16].
545/// \returns
546/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
547/// converted elements from \a __B, using biases from \a __A; higher order
548/// elements are zeroed. If the corresponding mask bit is not set, then the
549/// element from \a __W is taken instead.
550static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvts_biasph_bf8(__m128i
551 __W, __mmask8 __U, __m128i __A, __m128h __B) { return
552 (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( (__v16qi)__A,
553 (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); }
554
555/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
556/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
557/// 16-bit integer stored in \a __A. Results are saturated. Zeroing mask \a __U
558/// is used to determine if a given element should be zeroed instead.
559///
560/// \code{.operation}
561/// FOR i := 0 to 7
562/// IF __U[i]
563/// dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
564/// ELSE
565/// dst.bf8[i] := 0
566/// FI
567/// ENDFOR
568///
569/// dst[MAX:64] := 0
570/// \endcode
571///
572/// \headerfile <immintrin.h>
573///
574/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
575///
576/// \param __U
577/// An 8-bit zeroing mask.
578/// \param __A
579/// A 128-bit vector of [8 x int16].
580/// \param __B
581/// A 128-bit vector of [8 x fp16].
582/// \returns
583/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
584/// converted elements from \a __B, using biases from \a __A; higher order
585/// elements are zeroed. If the corresponding mask bit is not set, then the
586/// element is zeroed.
587static __inline__ __m128i __DEFAULT_FN_ATTRS128
588_mm_maskz_cvts_biasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
589 return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
590 (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
591 (__mmask8)__U);
592}
593
594
595/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
596/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
597/// 16-bit integer stored in \a __A. Results are saturated.
598///
599/// \code{.operation}
600/// FOR i := 0 to 15
601/// dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
602/// ENDFOR
603///
604/// dst[MAX:128] := 0
605/// \endcode
606///
607/// \headerfile <immintrin.h>
608///
609/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
610///
611/// \param __A
612/// A 256-bit vector of [16 x int16].
613/// \param __B
614/// A 256-bit vector of [16 x fp16].
615/// \returns
616/// A 128-bit vector of [16 x bf8]. Elements correspond to the
617/// converted elements from \a __B using biases from \a __A.
618static __inline__ __m128i __DEFAULT_FN_ATTRS256
619_mm256_cvts_biasph_bf8(__m256i __A, __m256h __B) {
620 return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
621 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
622 (__mmask16)-1);
623}
624
625/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
626/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
627/// 16-bit integer stored in \a __A. Results are saturated. Merging mask \a __U
628/// is used to determine if a given element should be taken from \a __W instead.
629///
630/// \code{.operation}
631/// FOR i := 0 to 15
632/// IF __U[i]
633/// dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
634/// ELSE
635/// dst.bf8[i] := __W.bf8[i]
636/// FI
637/// ENDFOR
638///
639/// dst[MAX:128] := 0
640/// \endcode
641///
642/// \headerfile <immintrin.h>
643///
644/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
645///
646/// \param __W
647/// A 128-bit vector of [16 x bf8].
648/// \param __U
649/// A 16-bit merging mask.
650/// \param __A
651/// A 256-bit vector of [16 x int16].
652/// \param __B
653/// A 256-bit vector of [16 x fp16].
654/// \returns
655/// A 128-bit vector of [16 x bf8]. Elements correspond to the converted
656/// elements from \a __B, using biases from \a __A. If the corresponding mask
657/// bit is not set, then the element from \a __W is taken instead.
658static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_biasph_bf8(
659 __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
660 return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
661 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
662}
663
664/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
665/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
666/// 16-bit integer stored in \a __A. Results are saturated. Zeroing mask \a __U
667/// is used to determine if a given element should be zeroed instead.
668///
669/// \code{.operation}
670/// FOR i := 0 to 15
671/// IF __U[i]
672/// dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
673/// ELSE
674/// dst.bf8[i] := 0
675/// FI
676/// ENDFOR
677///
678/// dst[MAX:128] := 0
679/// \endcode
680///
681/// \headerfile <immintrin.h>
682///
683/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
684///
685/// \param __U
686/// A 16-bit zeroing mask.
687/// \param __A
688/// A 256-bit vector of [16 x int16].
689/// \param __B
690/// A 256-bit vector of [16 x fp16].
691/// \returns
692/// A 128-bit vector of [16 x bf8]. Elements correspond to the converted
693/// elements from \a __B, using biases from \a __A. If the corresponding mask
694/// bit is not set, then the element is zeroed.
695static __inline__ __m128i __DEFAULT_FN_ATTRS256
696_mm256_maskz_cvts_biasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
697 return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
698 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
699 (__mmask16)__U);
700}
701
702/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
703/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
704/// 16-bit integer stored in \a __A.
705///
706/// \code{.operation}
707/// FOR i := 0 to 7
708/// dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
709/// ENDFOR
710///
711/// dst[MAX:64] := 0
712/// \endcode
713///
714/// \headerfile <immintrin.h>
715///
716/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
717///
718/// \param __A
719/// A 128-bit vector of [8 x int16].
720/// \param __B
721/// A 128-bit vector of [8 x fp16].
722/// \returns
723/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
724/// converted elements from \a __B using biases from \a __A; higher order
725/// elements are zeroed.
726static __inline__ __m128i __DEFAULT_FN_ATTRS128
727_mm_cvtbiasph_hf8(__m128i __A, __m128h __B) {
728 return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
729 (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
730}
731
732/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
733/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
734/// 16-bit integer stored in \a __A. Merging mask \a __U is used to determine if
735/// a given element should be taken from \a __W instead.
736///
737/// \code{.operation}
738/// FOR i := 0 to 7
739/// IF __U[i]
740/// dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
741/// ELSE
742/// dst.hf8[i] := __W.hf8[i]
743/// FI
744/// ENDFOR
745///
746/// dst[MAX:64] := 0
747/// \endcode
748///
749/// \headerfile <immintrin.h>
750///
751/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
752///
753/// \param __W
754/// A 128-bit vector of [16 x hf8].
755/// \param __U
756/// An 8-bit merging mask.
757/// \param __A
758/// A 128-bit vector of [8 x int16].
759/// \param __B
760/// A 128-bit vector of [8 x fp16].
761/// \returns
762/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
763/// converted elements from \a __B, using biases from \a __A; higher order
764/// elements are zeroed. If the corresponding mask bit is not set, then the
765/// element from \a __W is taken instead.
766static __inline__ __m128i __DEFAULT_FN_ATTRS128
767_mm_mask_cvtbiasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
768 return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
769 (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
770}
771
772/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
773/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
774/// 16-bit integer stored in \a __A. Zeroing mask \a __U is used to determine if
775/// a given element should be zeroed instead.
776///
777/// \code{.operation}
778/// FOR i := 0 to 7
779/// IF __U[i]
780/// dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
781/// ELSE
782/// dst.hf8[i] := 0
783/// FI
784/// ENDFOR
785///
786/// dst[MAX:64] := 0
787/// \endcode
788///
789/// \headerfile <immintrin.h>
790///
791/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
792///
793/// \param __U
794/// An 8-bit zeroing mask.
795/// \param __A
796/// A 128-bit vector of [8 x int16].
797/// \param __B
798/// A 128-bit vector of [8 x fp16].
799/// \returns
800/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
801/// converted elements from \a __B, using biases from \a __A; higher order
802/// elements are zeroed. If the corresponding mask bit is not set, then the
803/// element is zeroed.
804static __inline__ __m128i __DEFAULT_FN_ATTRS128
805_mm_maskz_cvtbiasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
806 return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
807 (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
808 (__mmask8)__U);
809}
810
811/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
812/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
813/// 16-bit integer stored in \a __A.
814///
815/// \code{.operation}
816/// FOR i := 0 to 15
817/// dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
818/// ENDFOR
819///
820/// dst[MAX:128] := 0
821/// \endcode
822///
823/// \headerfile <immintrin.h>
824///
825/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
826///
827/// \param __A
828/// A 256-bit vector of [16 x int16].
829/// \param __B
830/// A 256-bit vector of [16 x fp16].
831/// \returns
832/// A 128-bit vector of [16 x hf8]. Elements correspond to the
833/// converted elements from \a __B using biases from \a __A.
834static __inline__ __m128i __DEFAULT_FN_ATTRS256
835_mm256_cvtbiasph_hf8(__m256i __A, __m256h __B) {
836 return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
837 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
838 (__mmask16)-1);
839}
840
841/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
842/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
843/// 16-bit integer stored in \a __A. Merging mask \a __U is used to determine if
844/// a given element should be taken from \a __W instead.
845///
846/// \code{.operation}
847/// FOR i := 0 to 15
848/// IF __U[i]
849/// dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
850/// ELSE
851/// dst.hf8[i] := __W.hf8[i]
852/// FI
853/// ENDFOR
854///
855/// dst[MAX:128] := 0
856/// \endcode
857///
858/// \headerfile <immintrin.h>
859///
860/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
861///
862/// \param __W
863/// A 128-bit vector of [16 x hf8].
864/// \param __U
865/// A 16-bit merging mask.
866/// \param __A
867/// A 256-bit vector of [16 x int16].
868/// \param __B
869/// A 256-bit vector of [16 x fp16].
870/// \returns
871/// A 128-bit vector of [16 x hf8]. Elements correspond to the converted
872/// elements from \a __B, using biases from \a __A. If the corresponding mask
873/// bit is not set, then the element from \a __W is taken instead.
874static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_hf8(
875 __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
876 return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
877 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
878}
879
880/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
881/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
882/// 16-bit integer stored in \a __B. Zeroing mask \a __U is used to determine if
883/// given element should be taken zeroed instead.
884///
885/// \code{.operation}
886/// FOR i := 0 to 15
887/// IF __U[i]
888/// dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
889/// ELSE
890/// dst.hf8[i] := 0
891/// FI
892/// ENDFOR
893///
894/// dst[MAX:128] := 0
895/// \endcode
896///
897/// \headerfile <immintrin.h>
898///
899/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
900///
901/// \param __U
902/// A 16-bit zeroing mask.
/// \param __A
/// A 256-bit vector of [16 x int16].
/// \param __B
/// A 256-bit vector of [16 x fp16].
907/// \returns
908/// A 128-bit vector of [16 x hf8]. Elements correspond to the converted
909/// elements from \a __B, using biases from \a __A. If corresponding mask bit
910/// is not set, then element is zeroed.
911static __inline__ __m128i __DEFAULT_FN_ATTRS256
912_mm256_maskz_cvtbiasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
913 return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
914 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
915 (__mmask16)__U);
916}
917
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
/// 16-bit integer stored in \a __A. Results are saturated.
921///
922/// \code{.operation}
923/// FOR i := 0 to 7
924/// dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
925/// ENDFOR
926///
927/// dst[MAX:64] := 0
928/// \endcode
929///
930/// \headerfile <immintrin.h>
931///
/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
933///
934/// \param __A
935/// A 128-bit vector of [8 x int16].
936/// \param __B
937/// A 128-bit vector of [8 x fp16].
938/// \returns
939/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the
940/// converted elements from \a __B using biases from \a __A; higher order
941/// elements are zeroed.
942static __inline__ __m128i __DEFAULT_FN_ATTRS128
943_mm_cvts_biasph_hf8(__m128i __A, __m128h __B) {
944 return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
945 (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
946}
947
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
/// 16-bit integer stored in \a __A. Results are saturated. Merging mask \a __U
/// is used to determine if given element should be taken from \a __W instead.
952///
953/// \code{.operation}
954/// FOR i := 0 to 7
955/// IF __U[i]
956/// dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
957/// ELSE
958/// dst.hf8[i] := __W.hf8[i]
959/// FI
960/// ENDFOR
961///
962/// dst[MAX:64] := 0
963/// \endcode
964///
965/// \headerfile <immintrin.h>
966///
967/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
968///
969/// \param __W
970/// A 128-bit vector of [16 x hf8].
/// \param __U
/// An 8-bit merging mask.
973/// \param __A
974/// A 128-bit vector of [8 x int16].
975/// \param __B
976/// A 128-bit vector of [8 x fp16].
977/// \returns
978/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the
979/// converted elements from \a __B, using biases from \a __A; higher order
980/// elements are zeroed. If corresponding mask bit is not set, then element
981/// from \a __W is taken instead.
982static __inline__ __m128i __DEFAULT_FN_ATTRS128
983_mm_mask_cvts_biasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
984 return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
985 (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
986}
987
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
/// 16-bit integer stored in \a __A. Results are saturated. Zeroing mask \a __U
/// is used to determine if given element should be zeroed instead.
992///
993/// \code{.operation}
994/// FOR i := 0 to 7
995/// IF __U[i]
996/// dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
997/// ELSE
998/// dst.hf8[i] := 0
999/// FI
1000/// ENDFOR
1001///
1002/// dst[MAX:64] := 0
1003/// \endcode
1004///
1005/// \headerfile <immintrin.h>
1006///
1007/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1008///
/// \param __U
/// An 8-bit zeroing mask.
1011/// \param __A
1012/// A 128-bit vector of [8 x int16].
1013/// \param __B
1014/// A 128-bit vector of [8 x fp16].
1015/// \returns
1016/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the
1017/// converted elements from \a __B, using biases from \a __A; higher order
1018/// elements are zeroed. If corresponding mask bit is not set, then element
1019/// is zeroed.
1020static __inline__ __m128i __DEFAULT_FN_ATTRS128
1021_mm_maskz_cvts_biasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
1022 return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
1023 (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
1024 (__mmask8)__U);
1025}
1026
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
/// 16-bit integer stored in \a __A. Results are saturated.
1030///
1031/// \code{.operation}
1032/// FOR i := 0 to 15
1033/// dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
1034/// ENDFOR
1035///
1036/// dst[MAX:128] := 0
1037/// \endcode
1038///
1039/// \headerfile <immintrin.h>
1040///
1041/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1042///
1043/// \param __A
1044/// A 256-bit vector of [16 x int16].
1045/// \param __B
1046/// A 256-bit vector of [16 x fp16].
1047/// \returns
1048/// A 128-bit vector of [16 x hf8]. Elements correspond to the
1049/// converted elements from \a __B using biases from \a __A.
1050static __inline__ __m128i __DEFAULT_FN_ATTRS256
1051_mm256_cvts_biasph_hf8(__m256i __A, __m256h __B) {
1052 return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
1053 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
1054 (__mmask16)-1);
1055}
1056
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
/// 16-bit integer stored in \a __A. Results are saturated. Merging mask \a __U
/// is used to determine if given element should be taken from \a __W instead.
1061///
1062/// \code{.operation}
1063/// FOR i := 0 to 15
1064/// IF __U[i]
1065/// dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
1066/// ELSE
1067/// dst.hf8[i] := __W.hf8[i]
1068/// FI
1069/// ENDFOR
1070///
1071/// dst[MAX:128] := 0
1072/// \endcode
1073///
1074/// \headerfile <immintrin.h>
1075///
1076/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1077///
1078/// \param __W
1079/// A 128-bit vector of [16 x hf8].
1080/// \param __U
1081/// A 16-bit merging mask.
1082/// \param __A
1083/// A 256-bit vector of [16 x int16].
1084/// \param __B
1085/// A 256-bit vector of [16 x fp16].
1086/// \returns
1087/// A 128-bit vector of [16 x hf8]. Elements correspond to the converted
1088/// elements from \a __B, using biases from \a __A. If corresponding mask bit
1089/// is not set, then element from \a __W is taken instead.
1090static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_biasph_hf8(
1091 __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
1092 return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
1093 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
1094}
1095
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
/// to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
/// 16-bit integer stored in \a __A. Results are saturated. Zeroing mask \a __U
/// is used to determine if given element should be zeroed instead.
1100///
1101/// \code{.operation}
1102/// FOR i := 0 to 15
1103/// IF __U[i]
1104/// dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
1105/// ELSE
1106/// dst.hf8[i] := 0
1107/// FI
1108/// ENDFOR
1109///
1110/// dst[MAX:128] := 0
1111/// \endcode
1112///
1113/// \headerfile <immintrin.h>
1114///
1115/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1116///
1117/// \param __U
1118/// A 16-bit zeroing mask.
1119/// \param __A
1120/// A 256-bit vector of [16 x int16].
1121/// \param __B
1122/// A 256-bit vector of [16 x fp16].
1123/// \returns
1124/// A 128-bit vector of [16 x hf8]. Elements correspond to the converted
1125/// elements from \a __B, using biases from \a __A. If corresponding mask bit
1126/// is not set, then element is zeroed.
1127static __inline__ __m128i __DEFAULT_FN_ATTRS256
1128_mm256_maskz_cvts_biasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
1129 return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
1130 (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
1131 (__mmask16)__U);
1132}
1133
1134/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1135/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1136///
1137/// \code{.operation}
1138/// FOR i := 0 to 15
1139/// IF i < 8
1140/// dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1141/// ELSE
1142/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 8])
1143/// FI
1144/// ENDFOR
1145///
1146/// dst[MAX:128] := 0
1147/// \endcode
1148///
1149/// \headerfile <immintrin.h>
1150///
1151/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1152///
1153/// \param __A
1154/// A 128-bit vector of [8 x fp16].
1155/// \param __B
1156/// A 128-bit vector of [8 x fp16].
1157/// \returns
1158/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1159/// (converted) elements from \a __B; higher order elements correspond to the
1160/// (converted) elements from \a __A.
1161static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_bf8(__m128h __A,
1162 __m128h __B) {
1163 return (__m128i)__builtin_ia32_vcvt2ph2bf8_128((__v8hf)(__A),
1164 (__v8hf)(__B));
1165}
1166
1167/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1168/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1169/// Merging mask \a __U is used to determine if given element should be taken
1170/// from \a __W instead.
1171///
1172/// \code{.operation}
1173/// FOR i := 0 to 15
1174/// IF __U[i]
1175/// IF i < 8
1176/// dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1177/// ELSE
1178/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 8])
1179/// FI
1180/// ELSE
1181/// dst.bf8[i] := __W.bf8[i]
1182/// FI
1183/// ENDFOR
1184///
1185/// dst[MAX:128] := 0
1186/// \endcode
1187///
1188/// \headerfile <immintrin.h>
1189///
1190/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1191///
1192/// \param __W
1193/// A 128-bit vector of [16 x bf8].
1194/// \param __U
1195/// A 16-bit merging mask.
1196/// \param __A
1197/// A 128-bit vector of [8 x fp16].
1198/// \param __B
1199/// A 128-bit vector of [8 x fp16].
1200/// \returns
1201/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1202/// (converted) elements from \a __B; higher order elements correspond to the
1203/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1204/// element from \a __W is taken instead.
1205static __inline__ __m128i __DEFAULT_FN_ATTRS128
1206_mm_mask_cvt2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
1207 return (__m128i)__builtin_ia32_selectb_128(
1208 (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), (__v16qi)__W);
1209}
1210
1211/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1212/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1213/// Zeroing mask \a __U is used to determine if given element should be zeroed
1214/// instead.
1215///
1216/// \code{.operation}
1217/// FOR i := 0 to 15
1218/// IF __U[i]
1219/// IF i < 8
1220/// dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1221/// ELSE
1222/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 8])
1223/// FI
1224/// ELSE
1225/// dst.bf8[i] := 0
1226/// FI
1227/// ENDFOR
1228///
1229/// dst[MAX:128] := 0
1230/// \endcode
1231///
1232/// \headerfile <immintrin.h>
1233///
1234/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1235///
1236/// \param __U
1237/// A 16-bit zeroing mask.
1238/// \param __A
1239/// A 128-bit vector of [8 x fp16].
1240/// \param __B
1241/// A 128-bit vector of [8 x fp16].
1242/// \returns
1243/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1244/// (converted) elements from \a __B; higher order elements correspond to the
1245/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1246/// zero is taken instead.
1247static __inline__ __m128i __DEFAULT_FN_ATTRS128
1248_mm_maskz_cvt2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
1249 return (__m128i)__builtin_ia32_selectb_128(
1250 (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B),
1251 (__v16qi)(__m128i)_mm_setzero_si128());
1252}
1253
1254/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1255/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1256///
1257/// \code{.operation}
1258/// FOR i := 0 to 31
1259/// IF i < 16
1260/// dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1261/// ELSE
1262/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 16])
1263/// FI
1264/// ENDFOR
1265///
1266/// dst[MAX:256] := 0
1267/// \endcode
1268///
1269/// \headerfile <immintrin.h>
1270///
1271/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1272///
1273/// \param __A
1274/// A 256-bit vector of [16 x fp16].
1275/// \param __B
1276/// A 256-bit vector of [16 x fp16].
1277/// \returns
1278/// A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1279/// (converted) elements from \a __B; higher order elements correspond to the
1280/// (converted) elements from \a __A.
1281static __inline__ __m256i __DEFAULT_FN_ATTRS256
1282_mm256_cvt2ph_bf8(__m256h __A, __m256h __B) {
1283 return (__m256i)__builtin_ia32_vcvt2ph2bf8_256((__v16hf)(__A),
1284 (__v16hf)(__B));
1285}
1286
1287/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1288/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1289/// Merging mask \a __U is used to determine if given element should be taken
1290/// from \a __W instead.
1291///
1292/// \code{.operation}
1293/// FOR i := 0 to 31
1294/// IF __U[i]
1295/// IF i < 16
1296/// dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1297/// ELSE
1298/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 16])
1299/// FI
1300/// ELSE
1301/// dst.bf8[i] := __W.bf8[i]
1302/// FI
1303/// ENDFOR
1304///
1305/// dst[MAX:256] := 0
1306/// \endcode
1307///
1308/// \headerfile <immintrin.h>
1309///
1310/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1311///
1312/// \param __W
1313/// A 256-bit vector of [32 x bf8].
1314/// \param __U
1315/// A 32-bit merging mask.
1316/// \param __A
1317/// A 256-bit vector of [16 x fp16].
1318/// \param __B
1319/// A 256-bit vector of [16 x fp16].
1320/// \returns
1321/// A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1322/// (converted) elements from \a __B; higher order elements correspond to the
1323/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1324/// element from \a __W is taken instead.
1325static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvt2ph_bf8(
1326 __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
1327 return (__m256i)__builtin_ia32_selectb_256(
1328 (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W);
1329}
1330
1331/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1332/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1333/// Zeroing mask \a __U is used to determine if given element should be zeroed
1334/// instead.
1335///
1336/// \code{.operation}
1337/// FOR i := 0 to 31
1338/// IF __U[i]
1339/// IF i < 16
1340/// dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1341/// ELSE
1342/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 16])
1343/// FI
1344/// ELSE
1345/// dst.bf8[i] := 0
1346/// FI
1347/// ENDFOR
1348///
1349/// dst[MAX:256] := 0
1350/// \endcode
1351///
1352/// \headerfile <immintrin.h>
1353///
1354/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1355///
1356/// \param __U
1357/// A 32-bit zeroing mask.
1358/// \param __A
1359/// A 256-bit vector of [16 x fp16].
1360/// \param __B
1361/// A 256-bit vector of [16 x fp16].
1362/// \returns
1363/// A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1364/// (converted) elements from \a __B; higher order elements correspond to the
1365/// (converted) elements from \a __A. If corresponding mask bit is not set,
1366/// zero is taken instead.
1367static __inline__ __m256i __DEFAULT_FN_ATTRS256
1368_mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
1369 return (__m256i)__builtin_ia32_selectb_256(
1370 (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B),
1371 (__v32qi)(__m256i)_mm256_setzero_si256());
1372}
1373
1374/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1375/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1376/// Resulting elements are saturated in case of overflow.
1377///
1378/// \code{.operation}
1379/// FOR i := 0 to 15
1380/// IF i < 8
1381/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1382/// ELSE
1383/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 8])
1384/// FI
1385/// ENDFOR
1386///
1387/// dst[MAX:128] := 0
1388/// \endcode
1389///
1390/// \headerfile <immintrin.h>
1391///
1392/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1393///
1394/// \param __A
1395/// A 128-bit vector of [8 x fp16].
1396/// \param __B
1397/// A 128-bit vector of [8 x fp16].
1398/// \returns
1399/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1400/// (converted) elements from \a __B; higher order elements correspond to the
1401/// (converted) elements from \a __A.
1402static __inline__ __m128i __DEFAULT_FN_ATTRS128
1403_mm_cvts_2ph_bf8(__m128h __A, __m128h __B) {
1404 return (__m128i)__builtin_ia32_vcvt2ph2bf8s_128((__v8hf)(__A),
1405 (__v8hf)(__B));
1406}
1407
1408/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1409/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1410/// Merging mask \a __U is used to determine if given element should be taken
1411/// from \a __W instead. Resulting elements are saturated in case of overflow.
1412///
1413/// \code{.operation}
1414/// FOR i := 0 to 15
1415/// IF __U[i]
1416/// IF i < 8
1417/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1418/// ELSE
1419/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 8])
1420/// FI
1421/// ELSE
1422/// dst.bf8[i] := __W.bf8[i]
1423/// FI
1424/// ENDFOR
1425///
1426/// dst[MAX:128] := 0
1427/// \endcode
1428///
1429/// \headerfile <immintrin.h>
1430///
1431/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1432///
1433/// \param __W
1434/// A 128-bit vector of [16 x bf8].
1435/// \param __U
1436/// A 16-bit merging mask.
1437/// \param __A
1438/// A 128-bit vector of [8 x fp16].
1439/// \param __B
1440/// A 128-bit vector of [8 x fp16].
1441/// \returns
1442/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1443/// (converted) elements from \a __B; higher order elements correspond to the
1444/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1445/// element from \a __W is taken instead.
1446static __inline__ __m128i __DEFAULT_FN_ATTRS128
1447_mm_mask_cvts_2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
1448 return (__m128i)__builtin_ia32_selectb_128(
1449 (__mmask16)__U, (__v16qi)_mm_cvts_2ph_bf8(__A, __B), (__v16qi)__W);
1450}
1451
1452/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1453/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1454/// Zeroing mask \a __U is used to determine if given element should be zeroed
1455/// instead. Resulting elements are saturated in case of overflow.
1456///
1457/// \code{.operation}
1458/// FOR i := 0 to 15
1459/// IF __U[i]
1460/// IF i < 8
1461/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1462/// ELSE
1463/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 8])
1464/// FI
1465/// ELSE
1466/// dst.bf8[i] := 0
1467/// FI
1468/// ENDFOR
1469///
1470/// dst[MAX:128] := 0
1471/// \endcode
1472///
1473/// \headerfile <immintrin.h>
1474///
1475/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1476///
1477/// \param __U
1478/// A 16-bit zeroing mask.
1479/// \param __A
1480/// A 128-bit vector of [8 x fp16].
1481/// \param __B
1482/// A 128-bit vector of [8 x fp16].
1483/// \returns
1484/// A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1485/// (converted) elements from \a __B; higher order elements correspond to the
1486/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1487/// zero is taken instead.
1488static __inline__ __m128i __DEFAULT_FN_ATTRS128
1489_mm_maskz_cvts_2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
1490 return (__m128i)__builtin_ia32_selectb_128(
1491 (__mmask16)__U, (__v16qi)_mm_cvts_2ph_bf8(__A, __B),
1492 (__v16qi)(__m128i)_mm_setzero_si128());
1493}
1494
1495/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1496/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1497/// Resulting elements are saturated in case of overflow.
1498///
1499/// \code{.operation}
1500/// FOR i := 0 to 31
1501/// IF i < 16
1502/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1503/// ELSE
1504/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 16])
1505/// FI
1506/// ENDFOR
1507///
1508/// dst[MAX:256] := 0
1509/// \endcode
1510///
1511/// \headerfile <immintrin.h>
1512///
1513/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1514///
1515/// \param __A
1516/// A 256-bit vector of [16 x fp16].
1517/// \param __B
1518/// A 256-bit vector of [16 x fp16].
1519/// \returns
1520/// A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1521/// (converted) elements from \a __B; higher order elements correspond to the
1522/// (converted) elements from \a __A.
1523static __inline__ __m256i __DEFAULT_FN_ATTRS256
1524_mm256_cvts_2ph_bf8(__m256h __A, __m256h __B) {
1525 return (__m256i)__builtin_ia32_vcvt2ph2bf8s_256((__v16hf)(__A),
1526 (__v16hf)(__B));
1527}
1528
1529/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1530/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1531/// Merging mask \a __U is used to determine if given element should be taken
1532/// from \a __W instead. Resulting elements are saturated in case of overflow.
1533///
1534/// \code{.operation}
1535/// FOR i := 0 to 31
1536/// IF __U[i]
1537/// IF i < 16
1538/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1539/// ELSE
1540/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 16])
1541/// FI
1542/// ELSE
1543/// dst.bf8[i] := __W.bf8[i]
1544/// FI
1545/// ENDFOR
1546///
1547/// dst[MAX:256] := 0
1548/// \endcode
1549///
1550/// \headerfile <immintrin.h>
1551///
1552/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1553///
1554/// \param __W
1555/// A 256-bit vector of [32 x bf8].
1556/// \param __U
1557/// A 32-bit merging mask.
1558/// \param __A
1559/// A 256-bit vector of [16 x fp16].
1560/// \param __B
1561/// A 256-bit vector of [16 x fp16].
1562/// \returns
1563/// A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1564/// (converted) elements from \a __B; higher order elements correspond to the
1565/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1566/// element from \a __W is taken instead.
1567static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_2ph_bf8(
1568 __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
1569 return (__m256i)__builtin_ia32_selectb_256(
1570 (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_bf8(__A, __B), (__v32qi)__W);
1571}
1572
1573/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1574/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1575/// Zeroing mask \a __U is used to determine if given element should be zeroed
1576/// instead. Resulting elements are saturated in case of overflow.
1577///
1578/// \code{.operation}
1579/// FOR i := 0 to 31
1580/// IF __U[i]
1581/// IF i < 16
1582/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1583/// ELSE
1584/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 16])
1585/// FI
1586/// ELSE
1587/// dst.bf8[i] := 0
1588/// FI
1589/// ENDFOR
1590///
1591/// dst[MAX:256] := 0
1592/// \endcode
1593///
1594/// \headerfile <immintrin.h>
1595///
1596/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1597///
1598/// \param __U
1599/// A 32-bit zeroing mask.
1600/// \param __A
1601/// A 256-bit vector of [16 x fp16].
1602/// \param __B
1603/// A 256-bit vector of [16 x fp16].
1604/// \returns
1605/// A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1606/// (converted) elements from \a __B; higher order elements correspond to the
1607/// (converted) elements from \a __A. If corresponding mask bit is not set,
1608/// zero is taken instead.
1609static __inline__ __m256i __DEFAULT_FN_ATTRS256
1610_mm256_maskz_cvts_2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
1611 return (__m256i)__builtin_ia32_selectb_256(
1612 (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_bf8(__A, __B),
1613 (__v32qi)(__m256i)_mm256_setzero_si256());
1614}
1615
1616/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1617/// floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1618///
1619/// \code{.operation}
1620/// FOR i := 0 to 15
1621/// IF i < 8
1622/// dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1623/// ELSE
1624/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 8])
1625/// FI
1626/// ENDFOR
1627///
1628/// dst[MAX:128] := 0
1629/// \endcode
1630///
1631/// \headerfile <immintrin.h>
1632///
1633/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1634///
1635/// \param __A
1636/// A 128-bit vector of [8 x fp16].
1637/// \param __B
1638/// A 128-bit vector of [8 x fp16].
1639/// \returns
1640/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1641/// (converted) elements from \a __B; higher order elements correspond to the
1642/// (converted) elements from \a __A.
1643static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_hf8(__m128h __A,
1644 __m128h __B) {
1645 return (__m128i)__builtin_ia32_vcvt2ph2hf8_128((__v8hf)(__A),
1646 (__v8hf)(__B));
1647}
1648
1649/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1650/// floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1651/// Merging mask \a __U is used to determine if given element should be taken
1652/// from \a __W instead.
1653///
1654/// \code{.operation}
1655/// FOR i := 0 to 15
1656/// IF __U[i]
1657/// IF i < 8
1658/// dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1659/// ELSE
1660/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 8])
1661/// FI
1662/// ELSE
1663/// dst.hf8[i] := __W.hf8[i]
1664/// FI
1665/// ENDFOR
1666///
1667/// dst[MAX:128] := 0
1668/// \endcode
1669///
1670/// \headerfile <immintrin.h>
1671///
1672/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1673///
1674/// \param __W
1675/// A 128-bit vector of [16 x hf8].
1676/// \param __U
1677/// A 16-bit merging mask.
1678/// \param __A
1679/// A 128-bit vector of [8 x fp16].
1680/// \param __B
1681/// A 128-bit vector of [8 x fp16].
1682/// \returns
1683/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1684/// (converted) elements from \a __B; higher order elements correspond to the
1685/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1686/// element from \a __W is taken instead.
1687static __inline__ __m128i __DEFAULT_FN_ATTRS128
1688_mm_mask_cvt2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
1689 return (__m128i)__builtin_ia32_selectb_128(
1690 (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), (__v16qi)__W);
1691}
1692
1693/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1694/// floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1695/// Zeroing mask \a __U is used to determine if given element should be zeroed
1696/// instead.
1697///
1698/// \code{.operation}
1699/// FOR i := 0 to 15
1700/// IF __U[i]
1701/// IF i < 8
1702/// dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1703/// ELSE
1704/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 8])
1705/// FI
1706/// ELSE
1707/// dst.hf8[i] := 0
1708/// FI
1709/// ENDFOR
1710///
1711/// dst[MAX:128] := 0
1712/// \endcode
1713///
1714/// \headerfile <immintrin.h>
1715///
1716/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1717///
1718/// \param __U
1719/// A 16-bit zeroing mask.
1720/// \param __A
1721/// A 128-bit vector of [8 x fp16].
1722/// \param __B
1723/// A 128-bit vector of [8 x fp16].
1724/// \returns
1725/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1726/// (converted) elements from \a __B; higher order elements correspond to the
1727/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1728/// zero is taken instead.
1729static __inline__ __m128i __DEFAULT_FN_ATTRS128
1730_mm_maskz_cvt2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
1731 return (__m128i)__builtin_ia32_selectb_128(
1732 (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B),
1733 (__v16qi)(__m128i)_mm_setzero_si128());
1734}
1735
1736/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1737/// floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1738///
1739/// \code{.operation}
1740/// FOR i := 0 to 31
1741/// IF i < 16
1742/// dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1743/// ELSE
1744/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 16])
1745/// FI
1746/// ENDFOR
1747///
1748/// dst[MAX:256] := 0
1749/// \endcode
1750///
1751/// \headerfile <immintrin.h>
1752///
1753/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1754///
1755/// \param __A
1756/// A 256-bit vector of [16 x fp16].
1757/// \param __B
1758/// A 256-bit vector of [16 x fp16].
1759/// \returns
1760/// A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
1761/// (converted) elements from \a __B; higher order elements correspond to the
1762/// (converted) elements from \a __A.
1763static __inline__ __m256i __DEFAULT_FN_ATTRS256
1764_mm256_cvt2ph_hf8(__m256h __A, __m256h __B) {
1765 return (__m256i)__builtin_ia32_vcvt2ph2hf8_256((__v16hf)(__A),
1766 (__v16hf)(__B));
1767}
1768
1769/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1770/// floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1771/// Merging mask \a __U is used to determine if given element should be taken
1772/// from \a __W instead.
1773///
1774/// \code{.operation}
1775/// FOR i := 0 to 31
1776/// IF __U[i]
1777/// IF i < 16
1778/// dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1779/// ELSE
1780/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 16])
1781/// FI
1782/// ELSE
1783/// dst.hf8[i] := __W.hf8[i]
1784/// FI
1785/// ENDFOR
1786///
1787/// dst[MAX:256] := 0
1788/// \endcode
1789///
1790/// \headerfile <immintrin.h>
1791///
1792/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1793///
1794/// \param __W
1795/// A 256-bit vector of [32 x hf8].
1796/// \param __U
1797/// A 32-bit merging mask.
1798/// \param __A
1799/// A 256-bit vector of [16 x fp16].
1800/// \param __B
1801/// A 256-bit vector of [16 x fp16].
1802/// \returns
1803/// A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
1804/// (converted) elements from \a __B; higher order elements correspond to the
1805/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1806/// element from \a __W is taken instead.
1807static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvt2ph_hf8(
1808 __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  // Merge-masking is layered on top of the unmasked intrinsic: convert
  // everything first, then a byte-wise select driven by __U chooses between
  // the converted element and the corresponding byte of __W.
1809 return (__m256i)__builtin_ia32_selectb_256(
1810 (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W);
1811}
1812
1813/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1814/// floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1815/// Zeroing mask \a __U is used to determine if given element should be zeroed
1816/// instead.
1817///
1818/// \code{.operation}
1819/// FOR i := 0 to 31
1820/// IF __U[i]
1821/// IF i < 16
1822/// dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1823/// ELSE
1824/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 16])
1825/// FI
1826/// ELSE
1827/// dst.hf8[i] := 0
1828/// FI
1829/// ENDFOR
1830///
1831/// dst[MAX:256] := 0
1832/// \endcode
1833///
1834/// \headerfile <immintrin.h>
1835///
1836/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1837///
1838/// \param __U
1839/// A 32-bit zeroing mask.
1840/// \param __A
1841/// A 256-bit vector of [16 x fp16].
1842/// \param __B
1843/// A 256-bit vector of [16 x fp16].
1844/// \returns
1845/// A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
1846/// (converted) elements from \a __B; higher order elements correspond to the
1847/// (converted) elements from \a __A. If corresponding mask bit is not set,
1848/// zero is taken instead.
1849static __inline__ __m256i __DEFAULT_FN_ATTRS256
1850_mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
  // Zero-masking is layered on top of the unmasked intrinsic: the byte-wise
  // select driven by __U chooses between the converted element and a byte of
  // an all-zero vector.
1851 return (__m256i)__builtin_ia32_selectb_256(
1852 (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B),
1853 (__v32qi)(__m256i)_mm256_setzero_si256());
1854}
1855
1856/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1857/// floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1858/// Resulting elements are saturated in case of overflow.
1859///
1860/// \code{.operation}
1861/// FOR i := 0 to 15
1862/// IF i < 8
1863/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1864/// ELSE
1865/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 8])
1866/// FI
1867/// ENDFOR
1868///
1869/// dst[MAX:128] := 0
1870/// \endcode
1871///
1872/// \headerfile <immintrin.h>
1873///
1874/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1875///
1876/// \param __A
1877/// A 128-bit vector of [8 x fp16].
1878/// \param __B
1879/// A 128-bit vector of [8 x fp16].
1880/// \returns
1881/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1882/// (converted) elements from \a __B; higher order elements correspond to the
1883/// (converted) elements from \a __A.
1884static __inline__ __m128i __DEFAULT_FN_ATTRS128
1885_mm_cvts_2ph_hf8(__m128h __A, __m128h __B) {
  // Saturating ("s") form of the two-source conversion. Sources are forwarded
  // in (__A, __B) order; per the operation description, __B supplies result
  // elements 0..7 and __A elements 8..15.
1886 return (__m128i)__builtin_ia32_vcvt2ph2hf8s_128((__v8hf)(__A),
1887 (__v8hf)(__B));
1888}
1889
1890/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1891/// floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1892/// Merging mask \a __U is used to determine if given element should be taken
1893/// from \a __W instead. Resulting elements are saturated in case of overflow.
1894///
1895/// \code{.operation}
1896/// FOR i := 0 to 15
1897/// IF __U[i]
1898/// IF i < 8
1899/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1900/// ELSE
1901/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 8])
1902/// FI
1903/// ELSE
1904/// dst.hf8[i] := __W.hf8[i]
1905/// FI
1906/// ENDFOR
1907///
1908/// dst[MAX:128] := 0
1909/// \endcode
1910///
1911/// \headerfile <immintrin.h>
1912///
1913/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1914///
1915/// \param __W
1916/// A 128-bit vector of [16 x hf8].
1917/// \param __U
1918/// A 16-bit merging mask.
1919/// \param __A
1920/// A 128-bit vector of [8 x fp16].
1921/// \param __B
1922/// A 128-bit vector of [8 x fp16].
1923/// \returns
1924/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1925/// (converted) elements from \a __B; higher order elements correspond to the
1926/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1927/// element from \a __W is taken instead.
1928static __inline__ __m128i __DEFAULT_FN_ATTRS128
1929_mm_mask_cvts_2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  // Merge-masking is layered on top of the unmasked saturating intrinsic:
  // convert everything first, then a byte-wise select driven by __U chooses
  // between the converted element and the corresponding byte of __W.
1930 return (__m128i)__builtin_ia32_selectb_128(
1931 (__mmask16)__U, (__v16qi)_mm_cvts_2ph_hf8(__A, __B), (__v16qi)__W);
1932}
1933
1934/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1935/// floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1936/// Zeroing mask \a __U is used to determine if given element should be zeroed
1937/// instead. Resulting elements are saturated in case of overflow.
1938///
1939/// \code{.operation}
1940/// FOR i := 0 to 15
1941/// IF __U[i]
1942/// IF i < 8
1943/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1944/// ELSE
1945/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 8])
1946/// FI
1947/// ELSE
1948/// dst.hf8[i] := 0
1949/// FI
1950/// ENDFOR
1951///
1952/// dst[MAX:128] := 0
1953/// \endcode
1954///
1955/// \headerfile <immintrin.h>
1956///
1957/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1958///
1959/// \param __U
1960/// A 16-bit zeroing mask.
1961/// \param __A
1962/// A 128-bit vector of [8 x fp16].
1963/// \param __B
1964/// A 128-bit vector of [8 x fp16].
1965/// \returns
1966/// A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1967/// (converted) elements from \a __B; higher order elements correspond to the
1968/// (converted) elements from \a __A. If corresponding mask bit is not set, then
1969/// zero is taken instead.
1970static __inline__ __m128i __DEFAULT_FN_ATTRS128
1971_mm_maskz_cvts_2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
  // Zero-masking is layered on top of the unmasked saturating intrinsic: the
  // byte-wise select driven by __U chooses between the converted element and
  // a byte of an all-zero vector.
1972 return (__m128i)__builtin_ia32_selectb_128(
1973 (__mmask16)__U, (__v16qi)_mm_cvts_2ph_hf8(__A, __B),
1974 (__v16qi)(__m128i)_mm_setzero_si128());
1975}
1976
1977/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1978/// floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1979/// Resulting elements are saturated in case of overflow.
1980///
1981/// \code{.operation}
1982/// FOR i := 0 to 31
1983/// IF i < 16
1984/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1985/// ELSE
1986/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 16])
1987/// FI
1988/// ENDFOR
1989///
1990/// dst[MAX:256] := 0
1991/// \endcode
1992///
1993/// \headerfile <immintrin.h>
1994///
1995/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1996///
1997/// \param __A
1998/// A 256-bit vector of [16 x fp16].
1999/// \param __B
2000/// A 256-bit vector of [16 x fp16].
2001/// \returns
2002/// A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
2003/// (converted) elements from \a __B; higher order elements correspond to the
2004/// (converted) elements from \a __A.
2005static __inline__ __m256i __DEFAULT_FN_ATTRS256
2006_mm256_cvts_2ph_hf8(__m256h __A, __m256h __B) {
  // Saturating ("s") form of the two-source conversion. Sources are forwarded
  // in (__A, __B) order; per the operation description, __B supplies result
  // elements 0..15 and __A elements 16..31.
2007 return (__m256i)__builtin_ia32_vcvt2ph2hf8s_256((__v16hf)(__A),
2008 (__v16hf)(__B));
2009}
2010
2011/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
2012/// floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
2013/// Merging mask \a __U is used to determine if given element should be taken
2014/// from \a __W instead. Resulting elements are saturated in case of overflow.
2015///
2016/// \code{.operation}
2017/// FOR i := 0 to 31
2018/// IF __U[i]
2019/// IF i < 16
2020/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
2021/// ELSE
2022/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 16])
2023/// FI
2024/// ELSE
2025/// dst.hf8[i] := __W.hf8[i]
2026/// FI
2027/// ENDFOR
2028///
2029/// dst[MAX:256] := 0
2030/// \endcode
2031///
2032/// \headerfile <immintrin.h>
2033///
2034/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
2035///
2036/// \param __W
2037/// A 256-bit vector of [32 x hf8].
2038/// \param __U
2039/// A 32-bit merging mask.
2040/// \param __A
2041/// A 256-bit vector of [16 x fp16].
2042/// \param __B
2043/// A 256-bit vector of [16 x fp16].
2044/// \returns
2045/// A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
2046/// (converted) elements from \a __B; higher order elements correspond to the
2047/// (converted) elements from \a __A. If corresponding mask bit is not set, then
2048/// element from \a __W is taken instead.
2049static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_2ph_hf8(
2050 __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  // Merge-masking is layered on top of the unmasked saturating intrinsic:
  // convert everything first, then a byte-wise select driven by __U chooses
  // between the converted element and the corresponding byte of __W.
2051 return (__m256i)__builtin_ia32_selectb_256(
2052 (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_hf8(__A, __B), (__v32qi)__W);
2053}
2054
2055/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
2056/// floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
2057/// Zeroing mask \a __U is used to determine if given element should be zeroed
2058/// instead. Resulting elements are saturated in case of overflow.
2059///
2060/// \code{.operation}
2061/// FOR i := 0 to 31
2062/// IF __U[i]
2063/// IF i < 16
2064/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
2065/// ELSE
2066/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 16])
2067/// FI
2068/// ELSE
2069/// dst.hf8[i] := 0
2070/// FI
2071/// ENDFOR
2072///
2073/// dst[MAX:256] := 0
2074/// \endcode
2075///
2076/// \headerfile <immintrin.h>
2077///
2078/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
2079///
2080/// \param __U
2081/// A 32-bit zeroing mask.
2082/// \param __A
2083/// A 256-bit vector of [16 x fp16].
2084/// \param __B
2085/// A 256-bit vector of [16 x fp16].
2086/// \returns
2087/// A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
2088/// (converted) elements from \a __B; higher order elements correspond to the
2089/// (converted) elements from \a __A. If corresponding mask bit is not set,
2090/// zero is taken instead.
2091static __inline__ __m256i __DEFAULT_FN_ATTRS256
2092_mm256_maskz_cvts_2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
  // Zero-masking is layered on top of the unmasked saturating intrinsic: the
  // byte-wise select driven by __U chooses between the converted element and
  // a byte of an all-zero vector.
2093 return (__m256i)__builtin_ia32_selectb_256(
2094 (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_hf8(__A, __B),
2095 (__v32qi)(__m256i)_mm256_setzero_si256());
2096}
2097
2098/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2099/// elements to a 128-bit vector containing FP16 elements. The conversion is exact.
2100///
2101/// \code{.operation}
2102/// FOR i := 0 to 7
2103/// dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2104/// ENDFOR
2105///
2106/// dst[MAX:128] := 0
2107/// \endcode
2108///
2109/// \headerfile <immintrin.h>
2110///
2111/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2112///
2113/// \param __A
2114/// A 128-bit vector of [16 x hf8].
2115/// \returns
2116/// A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
2117/// (converted) elements from \a __A.
2118static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8_ph(__m128i __A) {
  // Unmasked form of the masked builtin: the 8-bit mask is all ones, so every
  // one of the 8 result elements is written and the passthrough operand is
  // never taken; it is therefore left undefined.
2119 return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
2120 (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1);
2121}
2122
2123/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2124/// elements to a 128-bit vector containing FP16 elements. The conversion is
2125/// exact. Merging mask \a __U is used to determine if given element should be
2126/// taken from \a __W instead.
2127///
2128/// \code{.operation}
2129/// FOR i := 0 to 7
2130/// IF __U[i]
2131/// dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2132/// ELSE
2133/// dst.fp16[i] := __W.fp16[i]
2134/// FI
2135/// ENDFOR
2136///
2137/// dst[MAX:128] := 0
2138/// \endcode
2139///
2140/// \headerfile <immintrin.h>
2141///
2142/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2143///
2144/// \param __W
2145/// A 128-bit vector of [8 x fp16].
2146/// \param __U
2147/// An 8-bit merging mask.
2148/// \param __A
2149/// A 128-bit vector of [16 x hf8].
2150/// \returns
2151/// A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
2152/// (converted) elements from \a __A. If corresponding mask bit is not set, then
2153/// element from \a __W is taken instead.
2154static __inline__ __m128h __DEFAULT_FN_ATTRS128
2155_mm_mask_cvthf8_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  // __W is handed to the builtin as the passthrough operand: result elements
  // whose bit in __U is clear are taken from it.
2156 return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
2157 (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U);
2158}
2159
2160/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2161/// elements to a 128-bit vector containing FP16 elements. The conversion is
2162/// exact. Zeroing mask \a __U is used to determine if given element should be
2163/// zeroed instead.
2164///
2165/// \code{.operation}
2166/// FOR i := 0 to 7
2167/// IF __U[i]
2168/// dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2169/// ELSE
2170/// dst.fp16[i] := 0
2171/// FI
2172/// ENDFOR
2173///
2174/// dst[MAX:128] := 0
2175/// \endcode
2176///
2177/// \headerfile <immintrin.h>
2178///
2179/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2180///
2181/// \param __U
2182/// An 8-bit zeroing mask.
2183/// \param __A
2184/// A 128-bit vector of [16 x hf8].
2185/// \returns
2186/// A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
2187/// (converted) elements from \a __A. If corresponding mask bit is not set, then
2188/// zero is taken instead.
2189static __inline__ __m128h __DEFAULT_FN_ATTRS128
2190_mm_maskz_cvthf8_ph(__mmask8 __U, __m128i __A) {
  // Zero-masking is expressed by passing an all-zero vector as the
  // passthrough operand, so elements whose bit in __U is clear become zero.
2191 return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
2192 (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U);
2193}
2194
2195/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2196/// elements to a 256-bit vector containing FP16 elements. The conversion is exact.
2197///
2198/// \code{.operation}
2199/// FOR i := 0 to 15
2200/// dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2201/// ENDFOR
2202///
2203/// dst[MAX:256] := 0
2204/// \endcode
2205///
2206/// \headerfile <immintrin.h>
2207///
2208/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2209///
2210/// \param __A
2211/// A 128-bit vector of [16 x hf8].
2212/// \returns
2213/// A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
2214/// (converted) elements from \a __A.
2215static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8_ph(__m128i __A) {
  // The source is a 128-bit vector: its 16 FP8 bytes widen to 16 FP16
  // elements, which is why the result is 256 bits wide. The 16-bit mask is
  // all ones, so the passthrough operand is never taken and is left undefined.
2216 return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
2217 (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1);
2218}
2219
2220/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2221/// elements to a 256-bit vector containing FP16 elements. The conversion is
2222/// exact. Merging mask \a __U is used to determine if given element should be
2223/// taken from \a __W instead.
2224///
2225/// \code{.operation}
2226/// FOR i := 0 to 15
2227/// IF __U[i]
2228/// dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2229/// ELSE
2230/// dst.fp16[i] := __W.fp16[i]
2231/// FI
2232/// ENDFOR
2233///
2234/// dst[MAX:256] := 0
2235/// \endcode
2236///
2237/// \headerfile <immintrin.h>
2238///
2239/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2240///
2241/// \param __W
2242/// A 256-bit vector of [16 x fp16].
2243/// \param __U
2244/// A 16-bit merging mask.
2245/// \param __A
2246/// A 128-bit vector of [16 x hf8].
2247/// \returns
2248/// A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
2249/// (converted) elements from \a __A. If corresponding mask bit is not set, then
2250/// element from \a __W is taken instead.
2251static __inline__ __m256h __DEFAULT_FN_ATTRS256
2252_mm256_mask_cvthf8_ph(__m256h __W, __mmask16 __U, __m128i __A) {
  // The source is a 128-bit vector of 16 FP8 bytes; the result is 16 FP16
  // elements. __W is the passthrough operand: result elements whose bit in
  // __U is clear are taken from it.
2253 return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
2254 (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U);
2255}
2256
2257/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2258/// elements to a 256-bit vector containing FP16 elements. The conversion is
2259/// exact. Zeroing mask \a __U is used to determine if given element should be
2260/// zeroed instead.
2261///
2262/// \code{.operation}
2263/// FOR i := 0 to 15
2264/// IF __U[i]
2265/// dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2266/// ELSE
2267/// dst.fp16[i] := 0
2268/// FI
2269/// ENDFOR
2270///
2271/// dst[MAX:256] := 0
2272/// \endcode
2273///
2274/// \headerfile <immintrin.h>
2275///
2276/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2277///
2278/// \param __U
2279/// A 16-bit zeroing mask.
2280/// \param __A
2281/// A 128-bit vector of [16 x hf8].
2282/// \returns
2283/// A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
2284/// (converted) elements from \a __A. If corresponding mask bit is not set, then
2285/// zero is taken instead.
2286static __inline__ __m256h __DEFAULT_FN_ATTRS256
2287_mm256_maskz_cvthf8_ph(__mmask16 __U, __m128i __A) {
  // The source is a 128-bit vector of 16 FP8 bytes; the result is 16 FP16
  // elements. Zero-masking is expressed by passing an all-zero vector as the
  // passthrough operand.
2288 return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
2289 (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U);
2290}
2291
2292/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2293/// to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2294/// resulting vector are zeroed.
2295///
2296/// \code{.operation}
2297/// FOR i := 0 to 7
2298/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2299/// ENDFOR
2300///
2301/// dst[MAX:64] := 0
2302/// \endcode
2303///
2304/// \headerfile <immintrin.h>
2305///
2306/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2307///
2308/// \param __A
2309/// A 128-bit vector of [8 x fp16].
2310/// \returns
2311/// A 128-bit vector of [16 x bf8]. Lower elements correspond to the (converted)
2312/// elements from \a __A; upper elements are zeroed.
2313static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_bf8(__m128h __A) {
  // Unmasked form of the masked builtin: the 8-bit mask (one bit per FP16
  // source element) is all ones, so the passthrough operand is never taken
  // and is left undefined.
2314 return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
2315 (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2316}
2317
2318/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2319/// to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2320/// resulting vector are zeroed. Merging mask \a __U is used to determine if
2321/// given element should be taken from \a __W instead.
2322///
2323/// \code{.operation}
2324/// FOR i := 0 to 7
2325/// IF __U[i]
2326/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2327/// ELSE
2328/// dst.bf8[i] := __W.bf8[i]
2329/// FI
2330/// ENDFOR
2331///
2332/// dst[MAX:64] := 0
2333/// \endcode
2334///
2335/// \headerfile <immintrin.h>
2336///
2337/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2338///
2339/// \param __W
2340/// A 128-bit vector of [16 x bf8].
2341/// \param __U
2342/// An 8-bit merging mask.
2343/// \param __A
2344/// A 128-bit vector of [8 x fp16].
2345/// \returns
2346/// A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2347/// (converted) elements from \a __A; upper elements are zeroed. If
2348/// corresponding mask bit is not set, then element from \a __W is taken instead.
2349static __inline__ __m128i __DEFAULT_FN_ATTRS128
2350_mm_mask_cvtph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
  // __W is handed to the builtin as the passthrough operand: converted
  // elements whose bit in __U is clear are taken from it. The mask has one bit
  // per FP16 source element (8 in total).
2351 return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
2352 (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2353}
2354
2355/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2356/// to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2357/// resulting vector are zeroed. Zeroing mask \a __U is used to determine if
2358/// given element should be zeroed instead.
2359///
2360/// \code{.operation}
2361/// FOR i := 0 to 7
2362/// IF __U[i]
2363/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2364/// ELSE
2365/// dst.bf8[i] := 0
2366/// FI
2367/// ENDFOR
2368///
2369/// dst[MAX:64] := 0
2370/// \endcode
2371///
2372/// \headerfile <immintrin.h>
2373///
2374/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2375///
2376/// \param __U
2377/// An 8-bit zeroing mask.
2378/// \param __A
2379/// A 128-bit vector of [8 x fp16].
2380/// \returns
2381/// A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2382/// (converted) elements from \a __A; upper elements are zeroed. If
2383/// corresponding mask bit is not set, then element is zeroed.
2384static __inline__ __m128i __DEFAULT_FN_ATTRS128
2385_mm_maskz_cvtph_bf8(__mmask8 __U, __m128h __A) {
  // Zero-masking is expressed by passing an all-zero vector as the
  // passthrough operand, so elements whose bit in __U is clear become zero.
2386 return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
2387 (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2388}
2389
2390/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2391/// to a 128-bit vector containing E5M2 FP8 elements.
2392///
2393/// \code{.operation}
2394/// FOR i := 0 to 15
2395/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2396/// ENDFOR
2397///
2398/// dst[MAX:128] := 0
2399/// \endcode
2400///
2401/// \headerfile <immintrin.h>
2402///
2403/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2404///
2405/// \param __A
2406/// A 256-bit vector of [16 x fp16].
2407/// \returns
2408/// A 128-bit vector of [16 x bf8]. Resulting elements correspond to the (converted)
2409/// elements from \a __A.
2410static __inline__ __m128i __DEFAULT_FN_ATTRS256
2411_mm256_cvtph_bf8(__m256h __A) {
  // 16 FP16 inputs narrow to 16 bytes, so the result (and the passthrough
  // operand) is a 128-bit vector. The 16-bit mask is all ones, so the
  // passthrough is never taken and is left undefined.
2412 return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
2413 (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2414}
2415
2416/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2417/// to a 128-bit vector containing E5M2 FP8 elements. Merging mask \a __U is
2418/// used to determine if given element should be taken from \a __W instead.
2419///
2420/// \code{.operation}
2421/// FOR i := 0 to 15
2422/// IF __U[i]
2423/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2424/// ELSE
2425/// dst.bf8[i] := __W.bf8[i]
2426/// FI
2427/// ENDFOR
2428///
2429/// dst[MAX:128] := 0
2430/// \endcode
2431///
2432/// \headerfile <immintrin.h>
2433///
2434/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2435///
2436/// \param __W
2437/// A 128-bit vector of [16 x bf8].
2438/// \param __U
2439/// A 16-bit merging mask.
2440/// \param __A
2441/// A 256-bit vector of [16 x fp16].
2442/// \returns
2443/// A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2444/// (converted) elements from \a __A. If
2445/// corresponding mask bit is not set, then element from \a __W is taken instead.
2446static __inline__ __m128i __DEFAULT_FN_ATTRS256
2447_mm256_mask_cvtph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
  // 16 FP16 inputs narrow to 16 bytes, so both the result and __W are 128-bit
  // vectors. __W is the passthrough operand: elements whose bit in __U is
  // clear are taken from it.
2448 return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
2449 (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
2450}
2451
2452/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2453/// to a 128-bit vector containing E5M2 FP8 elements. Zeroing mask \a __U is
2454/// used to determine if given element should be zeroed instead.
2455///
2456/// \code{.operation}
2457/// FOR i := 0 to 15
2458/// IF __U[i]
2459/// dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2460/// ELSE
2461/// dst.bf8[i] := 0
2462/// FI
2463/// ENDFOR
2464///
2465/// dst[MAX:128] := 0
2466/// \endcode
2467///
2468/// \headerfile <immintrin.h>
2469///
2470/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2471///
2472/// \param __U
2473/// A 16-bit zeroing mask.
2474/// \param __A
2475/// A 256-bit vector of [16 x fp16].
2476/// \returns
2477/// A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2478/// (converted) elements from \a __A. If corresponding mask bit is not set,
2479/// then element is zeroed instead.
2480static __inline__ __m128i __DEFAULT_FN_ATTRS256
2481_mm256_maskz_cvtph_bf8(__mmask16 __U, __m256h __A) {
  // 16 FP16 inputs narrow to 16 bytes, so the result is a 128-bit vector.
  // Zero-masking is expressed by passing an all-zero 128-bit vector as the
  // passthrough operand.
2482 return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
2483 (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
2484}
2485
2486/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2487/// to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2488/// resulting vector are zeroed. Results are saturated.
2489///
2490/// \code{.operation}
2491/// FOR i := 0 to 7
2492/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2493/// ENDFOR
2494///
2495/// dst[MAX:64] := 0
2496/// \endcode
2497///
2498/// \headerfile <immintrin.h>
2499///
2500/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2501///
2502/// \param __A
2503/// A 128-bit vector of [8 x fp16].
2504/// \returns
2505/// A 128-bit vector of [16 x bf8]. Lower elements correspond to the (converted)
2506/// elements from \a __A; upper elements are zeroed.
2507static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts_ph_bf8(__m128h __A) {
  // Saturating ("s") counterpart of _mm_cvtph_bf8. The 8-bit mask is all
  // ones, so the passthrough operand is never taken and is left undefined.
2508 return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
2509 (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2510}
2511
2512/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2513/// to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2514/// resulting vector are zeroed. Results are saturated. Merging mask \a __U is
2515/// used to determine if given element should be taken from \a __W instead.
2516///
2517/// \code{.operation}
2518/// FOR i := 0 to 7
2519/// IF __U[i]
2520/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2521/// ELSE
2522/// dst.bf8[i] := __W.bf8[i]
2523/// FI
2524/// ENDFOR
2525///
2526/// dst[MAX:64] := 0
2527/// \endcode
2528///
2529/// \headerfile <immintrin.h>
2530///
2531/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2532///
2533/// \param __W
2534/// A 128-bit vector of [16 x bf8].
2535/// \param __U
2536/// An 8-bit merging mask.
2537/// \param __A
2538/// A 128-bit vector of [8 x fp16].
2539/// \returns
2540/// A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2541/// (converted) elements from \a __A; upper elements are zeroed. If
2542/// corresponding mask bit is not set, then element from \a __W is taken instead.
2543static __inline__ __m128i __DEFAULT_FN_ATTRS128
2544_mm_mask_cvts_ph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
  // __W is handed to the saturating builtin as the passthrough operand:
  // converted elements whose bit in __U is clear are taken from it. The mask
  // has one bit per FP16 source element (8 in total).
2545 return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
2546 (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2547}
2548
2549/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2550/// to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2551/// resulting vector are zeroed. Results are saturated. Zeroing mask \a __U is
2552/// used to determine if given element should be zeroed instead.
2553///
2554/// \code{.operation}
2555/// FOR i := 0 to 7
2556/// IF __U[i]
2557/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2558/// ELSE
2559/// dst.bf8[i] := 0
2560/// FI
2561/// ENDFOR
2562///
2563/// dst[MAX:64] := 0
2564/// \endcode
2565///
2566/// \headerfile <immintrin.h>
2567///
2568/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2569///
2570/// \param __U
2571/// An 8-bit zeroing mask.
2572/// \param __A
2573/// A 128-bit vector of [8 x fp16].
2574/// \returns
2575/// A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2576/// (converted) elements from \a __A; upper elements are zeroed. If
2577/// corresponding mask bit is not set, then element is zeroed.
2578static __inline__ __m128i __DEFAULT_FN_ATTRS128
2579_mm_maskz_cvts_ph_bf8(__mmask8 __U, __m128h __A) {
  // Zero-masking is expressed by passing an all-zero vector as the
  // passthrough operand of the saturating builtin.
2580 return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
2581 (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2582}
2583
2584/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2585/// to a 128-bit vector containing E5M2 FP8 elements. Results are saturated.
2586///
2587/// \code{.operation}
2588/// FOR i := 0 to 15
2589/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2590/// ENDFOR
2591///
2592/// dst[MAX:128] := 0
2593/// \endcode
2594///
2595/// \headerfile <immintrin.h>
2596///
2597/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2598///
2599/// \param __A
2600/// A 256-bit vector of [16 x fp16].
2601/// \returns
2602/// A 128-bit vector of [16 x bf8]. Resulting elements correspond to the (converted)
2603/// elements from \a __A.
2604static __inline__ __m128i __DEFAULT_FN_ATTRS256
2605_mm256_cvts_ph_bf8(__m256h __A) {
  // Saturating ("s") counterpart of _mm256_cvtph_bf8. 16 FP16 inputs narrow
  // to 16 bytes, so the result is a 128-bit vector. The 16-bit mask is all
  // ones, so the passthrough operand is never taken and is left undefined.
2606 return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
2607 (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2608}
2609
2610/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2611/// to a 128-bit vector containing E5M2 FP8 elements. Results are saturated.
2612/// Merging mask \a __U is used to determine if given element should be taken
2613/// from \a __W instead.
2614///
2615/// \code{.operation}
2616/// FOR i := 0 to 15
2617/// IF __U[i]
2618/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2619/// ELSE
2620/// dst.bf8[i] := __W.bf8[i]
2621/// FI
2622/// ENDFOR
2623///
2624/// dst[MAX:128] := 0
2625/// \endcode
2626///
2627/// \headerfile <immintrin.h>
2628///
2629/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2630///
2631/// \param __W
2632/// A 128-bit vector of [16 x bf8].
2633/// \param __U
2634/// A 16-bit merging mask.
2635/// \param __A
2636/// A 256-bit vector of [16 x fp16].
2637/// \returns
2638/// A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2639/// (converted) elements from \a __A. If
2640/// corresponding mask bit is not set, then element from \a __W is taken instead.
2641static __inline__ __m128i __DEFAULT_FN_ATTRS256
2642_mm256_mask_cvts_ph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
  // 16 FP16 inputs narrow to 16 bytes, so both the result and __W are 128-bit
  // vectors. __W is the passthrough operand of the saturating builtin:
  // elements whose bit in __U is clear are taken from it.
2643 return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
2644 (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
2645}
2646
2647/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2648/// to a 128-bit vector containing E5M2 FP8 elements. Results are saturated.
2649/// Zeroing mask \a __U is used to determine if given element should be zeroed
2650/// instead.
2651///
2652/// \code{.operation}
2653/// FOR i := 0 to 15
2654/// IF __U[i]
2655/// dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2656/// ELSE
2657/// dst.bf8[i] := 0
2658/// FI
2659/// ENDFOR
2660///
2661/// dst[MAX:128] := 0
2662/// \endcode
2663///
2664/// \headerfile <immintrin.h>
2665///
2666/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2667///
2668/// \param __U
2669/// A 16-bit zeroing mask.
2670/// \param __A
2671/// A 256-bit vector of [16 x fp16].
2672/// \returns
2673/// A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2674/// (converted) elements from \a __A. If corresponding mask bit is not set,
2675/// then element is zeroed instead.
2676static __inline__ __m128i __DEFAULT_FN_ATTRS256
2677_mm256_maskz_cvts_ph_bf8(__mmask16 __U, __m256h __A) {
  // 16 FP16 inputs narrow to 16 bytes, so the result is a 128-bit vector.
  // Zero-masking is expressed by passing an all-zero 128-bit vector as the
  // passthrough operand of the saturating builtin.
2678 return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
2679 (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
2680}
2681
2682/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2683/// to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2684/// resulting vector are zeroed.
2685///
2686/// \code{.operation}
2687/// FOR i := 0 to 7
2688/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2689/// ENDFOR
2690///
2691/// dst[MAX:64] := 0
2692/// \endcode
2693///
2694/// \headerfile <immintrin.h>
2695///
2696/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2697///
2698/// \param __A
2699/// A 128-bit vector of [8 x fp16].
2700/// \returns
2701/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the (converted)
2702/// elements from \a __A; upper elements are zeroed.
2703static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_hf8(__m128h __A) {
2704 return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
2705 (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2706}
2707
2708/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2709/// to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2710/// resulting vector are zeroed. Merging mask \a __U is used to determine if
2711/// given element should be taken from \a __W instead.
2712///
2713/// \code{.operation}
2714/// FOR i := 0 to 7
2715/// IF __U[i]
2716/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2717/// ELSE
2718/// dst.hf8[i] := __W.hf8[i]
2719/// FI
2720/// ENDFOR
2721///
2722/// dst[MAX:64] := 0
2723/// \endcode
2724///
2725/// \headerfile <immintrin.h>
2726///
2727/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2728///
2729/// \param __W
2730/// A 128-bit vector of [16 x hf8].
2731/// \param __U
2732/// A 8-bit merging mask.
2733/// \param __A
2734/// A 128-bit vector of [8 x fp16].
2735/// \returns
2736/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2737/// (converted) elements from \a __A; upper elements are zeroed. If
2738/// corresponding mask bit is not set, then element from \a __W is taken instead.
2739static __inline__ __m128i __DEFAULT_FN_ATTRS128
2740_mm_mask_cvtph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
2741 return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
2742 (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2743}
2744
2745/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2746/// to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2747/// resulting vector are zeroed. Zeroing mask \a __U is used to determine if
2748/// given element should be zeroed instead.
2749///
2750/// \code{.operation}
2751/// FOR i := 0 to 7
2752/// IF __U[i]
2753/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2754/// ELSE
2755/// dst.hf8[i] := 0
2756/// FI
2757/// ENDFOR
2758///
2759/// dst[MAX:64] := 0
2760/// \endcode
2761///
2762/// \headerfile <immintrin.h>
2763///
2764/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2765///
2766/// \param __U
2767/// A 8-bit zeroing mask.
2768/// \param __A
2769/// A 128-bit vector of [8 x fp16].
2770/// \returns
2771/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2772/// (converted) elements from \a __A; upper elements are zeroed. If
2773/// corresponding mask bit is not set, then element is zeroed.
2774static __inline__ __m128i __DEFAULT_FN_ATTRS128
2775_mm_maskz_cvtph_hf8(__mmask8 __U, __m128h __A) {
2776 return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
2777 (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2778}
2779
2780/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2781/// to a 128-bit vector containing E4M3 FP8 elements.
2782///
2783/// \code{.operation}
2784/// FOR i := 0 to 15
2785/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2786/// ENDFOR
2787///
2788/// dst[MAX:128] := 0
2789/// \endcode
2790///
2791/// \headerfile <immintrin.h>
2792///
2793/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2794///
2795/// \param __A
2796/// A 256-bit vector of [16 x fp16].
2797/// \returns
2798/// A 128-bit vector of [16 x hf8]. Resulting elements correspond to the (converted)
2799/// elements from \a __A.
2800static __inline__ __m128i __DEFAULT_FN_ATTRS256
2801_mm256_cvtph_hf8(__m256h __A) {
2802 return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
2803 (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2804}
2805
2806/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2807/// to a 128-bit vector containing E4M3 FP8 elements. Merging mask \a __U is
2808/// used to determine if given element should be taken from \a __W instead.
2809///
2810/// \code{.operation}
2811/// FOR i := 0 to 15
2812/// IF __U[i]
2813/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2814/// ELSE
2815/// dst.hf8[i] := __W.hf8[i]
2816/// FI
2817/// ENDFOR
2818///
2819/// dst[MAX:128] := 0
2820/// \endcode
2821///
2822/// \headerfile <immintrin.h>
2823///
2824/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2825///
2826/// \param __W
2827/// A 128-bit vector of [16 x hf8].
2828/// \param __U
2829/// A 16-bit merging mask.
2830/// \param __A
2831/// A 256-bit vector of [8 x fp16].
2832/// \returns
2833/// A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
2834/// (converted) elements from \a __A. If
2835/// corresponding mask bit is not set, then element from \a __W is taken instead.
2836static __inline__ __m128i __DEFAULT_FN_ATTRS256
2837_mm256_mask_cvtph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
2838 return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
2839 (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
2840}
2841
2842/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2843/// to a 128-bit vector containing E4M3 FP8 elements. Zeroing mask \a __U is
2844/// used to determine if given element should be zeroed instead.
2845///
2846/// \code{.operation}
2847/// FOR i := 0 to 15
2848/// IF __U[i]
2849/// dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2850/// ELSE
2851/// dst.hf8[i] := 0
2852/// FI
2853/// ENDFOR
2854///
2855/// dst[MAX:128] := 0
2856/// \endcode
2857///
2858/// \headerfile <immintrin.h>
2859///
2860/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2861///
2862/// \param __U
2863/// A 16-bit zeroing mask.
2864/// \param __A
2865/// A 256-bit vector of [16 x fp16].
2866/// \returns
2867/// A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
2868/// (converted) elements from \a __A. If corresponding mask bit is not set,
2869/// then element is zeroed instead.
2870static __inline__ __m128i __DEFAULT_FN_ATTRS256
2871_mm256_maskz_cvtph_hf8(__mmask16 __U, __m256h __A) {
2872 return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
2873 (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
2874}
2875
2876/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2877/// to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2878/// resulting vector are zeroed. Results are saturated.
2879///
2880/// \code{.operation}
2881/// FOR i := 0 to 7
2882/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2883/// ENDFOR
2884///
2885/// dst[MAX:64] := 0
2886/// \endcode
2887///
2888/// \headerfile <immintrin.h>
2889///
2890/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2891///
2892/// \param __A
2893/// A 128-bit vector of [8 x fp16].
2894/// \returns
2895/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the (converted)
2896/// elements from \a __A; upper elements are zeroed.
2897static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts_ph_hf8(__m128h __A) {
2898 return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
2899 (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2900}
2901
2902/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2903/// to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2904/// resulting vector are zeroed. Results are saturated. Merging mask \a __U is
2905/// used to determine if given element should be taken from \a __W instead.
2906///
2907/// \code{.operation}
2908/// FOR i := 0 to 7
2909/// IF __U[i]
2910/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2911/// ELSE
2912/// dst.hf8[i] := __W.hf8[i]
2913/// FI
2914/// ENDFOR
2915///
2916/// dst[MAX:64] := 0
2917/// \endcode
2918///
2919/// \headerfile <immintrin.h>
2920///
2921/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2922///
2923/// \param __W
2924/// A 128-bit vector of [16 x hf8].
2925/// \param __U
2926/// A 8-bit merging mask.
2927/// \param __A
2928/// A 128-bit vector of [8 x fp16].
2929/// \returns
2930/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2931/// (converted) elements from \a __A; upper elements are zeroed. If
2932/// corresponding mask bit is not set, then element from \a __W is taken instead.
2933static __inline__ __m128i __DEFAULT_FN_ATTRS128
2934_mm_mask_cvts_ph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
2935 return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
2936 (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2937}
2938
2939/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2940/// to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2941/// resulting vector are zeroed. Results are saturated. Zeroing mask \a __U is
2942/// used to determine if given element should be zeroed instead.
2943///
2944/// \code{.operation}
2945/// FOR i := 0 to 7
2946/// IF __U[i]
2947/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2948/// ELSE
2949/// dst.hf8[i] := 0
2950/// FI
2951/// ENDFOR
2952///
2953/// dst[MAX:64] := 0
2954/// \endcode
2955///
2956/// \headerfile <immintrin.h>
2957///
2958/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2959///
2960/// \param __U
2961/// A 8-bit zeroing mask.
2962/// \param __A
2963/// A 128-bit vector of [8 x fp16].
2964/// \returns
2965/// A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2966/// (converted) elements from \a __A; upper elements are zeroed. If
2967/// corresponding mask bit is not set, then element is zeroed.
2968static __inline__ __m128i __DEFAULT_FN_ATTRS128
2969_mm_maskz_cvts_ph_hf8(__mmask8 __U, __m128h __A) {
2970 return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
2971 (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2972}
2973
2974/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2975/// to a 128-bit vector containing E4M3 FP8 elements. Results are saturated.
2976///
2977/// \code{.operation}
2978/// FOR i := 0 to 15
2979/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2980/// ENDFOR
2981///
2982/// dst[MAX:128] := 0
2983/// \endcode
2984///
2985/// \headerfile <immintrin.h>
2986///
2987/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2988///
2989/// \param __A
2990/// A 256-bit vector of [16 x fp16].
2991/// \returns
2992/// A 128-bit vector of [16 x hf8]. Resulting elements correspond to the (converted)
2993/// elements from \a __A.
2994static __inline__ __m128i __DEFAULT_FN_ATTRS256
2995_mm256_cvts_ph_hf8(__m256h __A) {
2996 return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
2997 (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2998}
2999
3000/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
3001/// to a 128-bit vector containing E4M3 FP8 elements. Results are saturated.
3002/// Merging mask \a __U is used to determine if given element should be taken
3003/// from \a __W instead.
3004///
3005/// \code{.operation}
3006/// FOR i := 0 to 15
3007/// IF __U[i]
3008/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
3009/// ELSE
3010/// dst.hf8[i] := __W.hf8[i]
3011/// FI
3012/// ENDFOR
3013///
3014/// dst[MAX:128] := 0
3015/// \endcode
3016///
3017/// \headerfile <immintrin.h>
3018///
3019/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
3020///
3021/// \param __W
3022/// A 128-bit vector of [16 x hf8].
3023/// \param __U
3024/// A 16-bit merging mask.
3025/// \param __A
3026/// A 256-bit vector of [8 x fp16].
3027/// \returns
3028/// A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
3029/// (converted) elements from \a __A. If
3030/// corresponding mask bit is not set, then element from \a __W is taken instead.
3031static __inline__ __m128i __DEFAULT_FN_ATTRS256
3032_mm256_mask_cvts_ph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
3033 return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
3034 (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
3035}
3036
3037/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
3038/// to a 128-bit vector containing E4M3 FP8 elements. Results are saturated.
3039/// Zeroing mask \a __U is used to determine if given element should be zeroed
3040/// instead.
3041///
3042/// \code{.operation}
3043/// FOR i := 0 to 15
3044/// IF __U[i]
3045/// dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
3046/// ELSE
3047/// dst.hf8[i] := 0
3048/// FI
3049/// ENDFOR
3050///
3051/// dst[MAX:128] := 0
3052/// \endcode
3053///
3054/// \headerfile <immintrin.h>
3055///
3056/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
3057///
3058/// \param __U
3059/// A 16-bit zeroing mask.
3060/// \param __A
3061/// A 256-bit vector of [16 x fp16].
3062/// \returns
3063/// A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
3064/// (converted) elements from \a __A. If corresponding mask bit is not set,
3065/// then element is zeroed instead.
3066static __inline__ __m128i __DEFAULT_FN_ATTRS256
3067_mm256_maskz_cvts_ph_hf8(__mmask16 __U, __m256h __A) {
3068 return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
3069 (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
3070}
3071
3072/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3073/// elements to a 128-bit vector containing FP16 elements. The conversion is exact.
3074///
3075/// \code{.operation}
3076/// FOR i := 0 to 7
3077/// dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3078/// ENDFOR
3079/// \endcode
3080///
3081/// \headerfile <immintrin.h>
3082///
3083/// This intrinsic does not correspond to a single instruction.
3084///
3085/// \param __A
3086/// A 128-bit vector of [16 x bf8].
3087/// \returns
3088/// A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
3089/// (converted) elements from \a __A.
3090static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtbf8_ph(__m128i __A) {
3091 return _mm_castsi128_ph(_mm_slli_epi16(_mm_cvtepi8_epi16(__A), 8));
3092}
3093
3094/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3095/// elements to a 128-bit vector containing FP16 elements. The conversion is
3096/// exact. Merging mask \a __U is used to determine if given element should be
3097/// taken from \a __W instead.
3098///
3099/// \code{.operation}
3100/// FOR i := 0 to 7
3101/// IF __U[i]
3102/// dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3103/// ELSE
3104/// dst.fp16[i] := __W.fp16[i]
3105/// FI
3106/// ENDFOR
3107/// \endcode
3108///
3109/// \headerfile <immintrin.h>
3110///
3111/// This intrinsic does not correspond to a single instruction.
3112///
3113/// \param __W
3114/// A 128-bit vector of [8 x fp16].
3115/// \param __U
3116/// A 8-bit merging mask.
3117/// \param __A
3118/// A 128-bit vector of [16 x bf8].
3119/// \returns
3120/// A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
3121/// (converted) elements from \a __A. If corresponding mask bit is not set, then
3122/// element from \a __W is taken instead.
3123static __inline__ __m128h __DEFAULT_FN_ATTRS128
3124_mm_mask_cvtbf8_ph(__m128h __W, __mmask8 __U, __m128i __A) {
3125 return _mm_castsi128_ph(
3126 _mm_mask_slli_epi16((__m128i)__W, __U, _mm_cvtepi8_epi16(__A), 8));
3127}
3128
3129/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3130/// elements to a 128-bit vector containing FP16 elements. The conversion is
3131/// exact. Zeroing mask \a __U is used to determine if given element should be
3132/// zeroed instead.
3133///
3134/// \code{.operation}
3135/// FOR i := 0 to 7
3136/// IF __U[i]
3137/// dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3138/// ELSE
3139/// dst.fp16[i] := 0
3140/// FI
3141/// ENDFOR
3142/// \endcode
3143///
3144/// \headerfile <immintrin.h>
3145///
3146/// This intrinsic does not correspond to a single instruction.
3147///
3148/// \param __U
3149/// A 8-bit zeroing mask.
3150/// \param __A
3151/// A 128-bit vector of [16 x bf8].
3152/// \returns
3153/// A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
3154/// (converted) elements from \a __A. If corresponding mask bit is not set, then
3155/// zero is taken instead.
3156static __inline__ __m128h __DEFAULT_FN_ATTRS128
3157_mm_maskz_cvtbf8_ph(__mmask8 __U, __m128i __A) {
3158 return _mm_castsi128_ph(_mm_slli_epi16(_mm_maskz_cvtepi8_epi16(__U, __A), 8));
3159}
3160
3161/// Convert 256-bit vector \a __A, containing packed FP8 E4M3 floating-point
3162/// elements to a 256-bit vector containing FP16 elements. The conversion is exact.
3163///
3164/// \code{.operation}
3165/// FOR i := 0 to 15
3166/// dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3167/// ENDFOR
3168/// \endcode
3169///
3170/// \headerfile <immintrin.h>
3171///
3172/// This intrinsic does not correspond to a single instruction.
3173///
3174/// \param __A
3175/// A 256-bit vector of [32 x bf8].
3176/// \returns
3177/// A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
3178/// (converted) elements from \a __A.
3179static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtbf8_ph(__m128i __A) {
3180 return _mm256_castsi256_ph(_mm256_slli_epi16(_mm256_cvtepi8_epi16(__A), 8));
3181}
3182
3183/// Convert 256-bit vector \a __A, containing packed FP8 E5M2 floating-point
3184/// elements to a 256-bit vector containing FP16 elements. The conversion is
3185/// exact. Merging mask \a __U is used to determine if given element should be
3186/// taken from \a __W instead.
3187///
3188/// \code{.operation}
3189/// FOR i := 0 to 15
3190/// IF __U[i]
3191/// dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3192/// ELSE
3193/// dst.fp16[i] := __W.fp16[i]
3194/// FI
3195/// ENDFOR
3196/// \endcode
3197///
3198/// \headerfile <immintrin.h>
3199///
3200/// This intrinsic does not correspond to a single instruction.
3201///
3202/// \param __W
3203/// A 256-bit vector of [16 x fp16].
3204/// \param __U
3205/// A 16-bit merging mask.
3206/// \param __A
3207/// A 256-bit vector of [32 x bf8].
3208/// \returns
3209/// A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
3210/// (converted) elements from \a __A. If corresponding mask bit is not set, then
3211/// element from \a __W is taken instead.
3212static __inline__ __m256h __DEFAULT_FN_ATTRS256
3213_mm256_mask_cvtbf8_ph(__m256h __W, __mmask16 __U, __m128i __A) {
3214 return _mm256_castsi256_ph(
3215 _mm256_mask_slli_epi16((__m256i)__W, __U, _mm256_cvtepi8_epi16(__A), 8));
3216}
3217
3218/// Convert 256-bit vector \a __A, containing packed FP8 E5M2 floating-point
3219/// elements to a 256-bit vector containing FP16 elements. The conversion is
3220/// exact. Zeroing mask \a __U is used to determine if given element should be
3221/// zeroed instead.
3222///
3223/// \code{.operation}
3224/// FOR i := 0 to 15
3225/// IF __U[i]
3226/// dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3227/// ELSE
3228/// dst.fp16[i] := 0
3229/// FI
3230/// ENDFOR
3231/// \endcode
3232///
3233/// \headerfile <immintrin.h>
3234///
3235/// This intrinsic does not correspond to a single instruction.
3236///
3237/// \param __U
3238/// A 16-bit zeroing mask.
3239/// \param __A
3240/// A 256-bit vector of [32 x bf8].
3241/// \returns
3242/// A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
3243/// (converted) elements from \a __A. If corresponding mask bit is not set, then
3244/// zero is taken instead.
3245static __inline__ __m256h __DEFAULT_FN_ATTRS256
3246_mm256_maskz_cvtbf8_ph(__mmask16 __U, __m128i __A) {
3247 return _mm256_castsi256_ph(
3249}
3250
3251// clang-format on
3252
3253#undef __DEFAULT_FN_ATTRS128
3254#undef __DEFAULT_FN_ATTRS256
3255
3256#endif // __AVX10_2CONVERTINTRIN_H
3257#endif // __SSE2__
#define __DEFAULT_FN_ATTRS128
#define __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cvtepi8_epi16(__m128i __V)
Sign-extends bytes from the 128-bit integer vector in __V and returns the 16-bit values in the corres...
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_slli_epi16(__m256i __a, int __count)
Shifts each 16-bit element of the 256-bit vector of [16 x i16] in __a left by __count bits,...
unsigned int __mmask32
unsigned char __mmask8
unsigned short __mmask16
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, unsigned int __B)
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4340
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3878
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition emmintrin.h:3493
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition emmintrin.h:2769
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi16(__m128i __V)
Sign-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition smmintrin.h:1225