clang 22.0.0git
amxavx512intrin.h
Go to the documentation of this file.
1/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===------------------------------------------------------------------------===
8 */
9#ifndef __IMMINTRIN_H
10#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
11#endif // __IMMINTRIN_H
12
13#ifndef __AMX_AVX512INTRIN_H
14#define __AMX_AVX512INTRIN_H
15#if defined(__x86_64__) && defined(__SSE2__)
16
/// Attributes applied to the inline functions defined in this header:
/// always inline, no debug info, require the "amx-avx512" and "avx10.2"
/// target features, and declare a minimum vector width of 512 bits.
17#define __DEFAULT_FN_ATTRS_AVX512 \
18 __attribute__((__always_inline__, __nodebug__, \
19 __target__("amx-avx512,avx10.2"), __min_vector_width__(512)))
20
21/// Moves a row from a tile register to a zmm destination register, converting
22/// the int32 source elements to fp32. The row of the tile is selected by a
23/// 32b GPR.
24///
25/// \headerfile <x86intrin.h>
26///
27/// \code
28/// __m512 _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
29/// \endcode
30///
31/// \code{.operation}
32/// VL := 512
33/// VL_bytes := VL >> 3
34/// row_index := row & 0xffff
35/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
36/// FOR i := 0 TO (VL_bytes / 4) - 1
37/// IF i + row_chunk / 4 >= tsrc.colsb / 4
38/// dst.dword[i] := 0
39/// ELSE
40/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
41/// FI
42/// ENDFOR
43/// dst[MAX_VL-1:VL] := 0
44/// zero_tileconfig_start()
45/// \endcode
46///
47/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
48///
49/// \param tsrc
50/// The source tile. Max size is 1024 Bytes.
51/// \param row
52/// Bits 15:0 select the row of the source tile; bits 31:16 select the 64-byte chunk.
53#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
54
55/// Moves a row from a tile register to a zmm destination register, converting
56/// the int32 source elements to fp32. The row of the tile is selected by an
57/// 8b immediate value.
58///
59/// \headerfile <x86intrin.h>
60///
61/// \code
62/// __m512 _tile_cvtrowd2psi(__tile tsrc, const unsigned int imm8);
63/// \endcode
64///
65/// \code{.operation}
66/// VL := 512
67/// VL_bytes := VL >> 3
68/// row_index := imm8 & 0x3f
69/// row_chunk := (imm8 >> 6) * VL_bytes
70/// FOR i := 0 TO (VL_bytes / 4) - 1
71/// IF i + row_chunk / 4 >= tsrc.colsb / 4
72/// dst.dword[i] := 0
73/// ELSE
74/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
75/// FI
76/// ENDFOR
77/// dst[MAX_VL-1:VL] := 0
78/// zero_tileconfig_start()
79/// \endcode
80///
81/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
82///
83/// \param tsrc
84/// The source tile. Max size is 1024 Bytes.
85/// \param imm8
86/// Bits 5:0 select the row of the source tile; the bits above them (imm8 >> 6) select the 64-byte chunk.
87#define _tile_cvtrowd2psi(tsrc, imm8) __builtin_ia32_tcvtrowd2psi(tsrc, imm8)
88
89/// Moves a row from a tile register to a zmm destination register, converting
90/// the fp32 source elements to bf16. It places the resulting bf16 elements
91/// in the high 16 bits within each dword. The row of the tile is selected
92/// by a 32b GPR.
93///
94/// \headerfile <x86intrin.h>
95///
96/// \code
97/// __m512bh _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
98/// \endcode
99///
100/// \code{.operation}
101/// VL := 512
102/// VL_bytes := VL >> 3
103/// row_index := row & 0xffff
104/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
105/// FOR i := 0 TO (VL_bytes / 4) - 1
106/// IF i + row_chunk / 4 >= tsrc.colsb / 4
107/// dst.dword[i] := 0
108/// ELSE
109/// dst.word[2*i+0] := 0
110/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
111/// FI
112/// ENDFOR
113/// dst[MAX_VL-1:VL] := 0
114/// zero_tileconfig_start()
115/// \endcode
116///
117/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
118///
119/// \param tsrc
120/// The source tile. Max size is 1024 Bytes.
121/// \param row
122/// Bits 15:0 select the row of the source tile; bits 31:16 select the 64-byte chunk.
123#define _tile_cvtrowps2bf16h(tsrc, row) \
124 __builtin_ia32_tcvtrowps2bf16h(tsrc, row)
125
126/// Moves a row from a tile register to a zmm destination register, converting
127/// the fp32 source elements to bf16. It places the resulting bf16 elements
128/// in the high 16 bits within each dword. The row of the tile is selected
129/// by an 8b immediate value.
130///
131/// \headerfile <x86intrin.h>
132///
133/// \code
134/// __m512bh _tile_cvtrowps2bf16hi(__tile tsrc, const unsigned int imm8);
135/// \endcode
136///
137/// \code{.operation}
138/// VL := 512
139/// VL_bytes := VL >> 3
140/// row_index := imm8 & 0x3f
141/// row_chunk := (imm8 >> 6) * VL_bytes
142/// FOR i := 0 TO (VL_bytes / 4) - 1
143/// IF i + row_chunk / 4 >= tsrc.colsb / 4
144/// dst.dword[i] := 0
145/// ELSE
146/// dst.word[2*i+0] := 0
147/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
148/// FI
149/// ENDFOR
150/// dst[MAX_VL-1:VL] := 0
151/// zero_tileconfig_start()
152/// \endcode
153///
154/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
155///
156/// \param tsrc
157/// The source tile. Max size is 1024 Bytes.
158/// \param imm8
159/// Bits 5:0 select the row of the source tile; the bits above them (imm8 >> 6) select the 64-byte chunk.
160#define _tile_cvtrowps2bf16hi(tsrc, imm8) \
161 __builtin_ia32_tcvtrowps2bf16hi(tsrc, imm8)
162
163/// Moves a row from a tile register to a zmm destination register, converting
164/// the fp32 source elements to bf16. It places the resulting bf16 elements
165/// in the low 16 bits within each dword. The row of the tile is selected
166/// by a 32b GPR.
167///
168/// \headerfile <x86intrin.h>
169///
170/// \code
171/// __m512bh _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
172/// \endcode
173///
174/// \code{.operation}
175/// VL := 512
176/// VL_bytes := VL >> 3
177/// row_index := row & 0xffff
178/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
179/// FOR i := 0 TO (VL_bytes / 4) - 1
180/// IF i + row_chunk / 4 >= tsrc.colsb / 4
181/// dst.dword[i] := 0
182/// ELSE
183/// dst.word[2*i+1] := 0
184/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
185/// FI
186/// ENDFOR
187/// dst[MAX_VL-1:VL] := 0
188/// zero_tileconfig_start()
189/// \endcode
190///
191/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
192///
193/// \param tsrc
194/// The source tile. Max size is 1024 Bytes.
195/// \param row
196/// Bits 15:0 select the row of the source tile; bits 31:16 select the 64-byte chunk.
197#define _tile_cvtrowps2bf16l(tsrc, row) \
198 __builtin_ia32_tcvtrowps2bf16l(tsrc, row)
199
200/// Moves a row from a tile register to a zmm destination register, converting
201/// the fp32 source elements to bf16. It places the resulting bf16 elements
202/// in the low 16 bits within each dword. The row of the tile is selected
203/// by an 8b immediate value.
204///
205/// \headerfile <x86intrin.h>
206///
207/// \code
208/// __m512bh _tile_cvtrowps2bf16li(__tile tsrc, const unsigned int imm8);
209/// \endcode
210///
211/// \code{.operation}
212/// VL := 512
213/// VL_bytes := VL >> 3
214/// row_index := imm8 & 0x3f
215/// row_chunk := (imm8 >> 6) * VL_bytes
216/// FOR i := 0 TO (VL_bytes / 4) - 1
217/// IF i + row_chunk / 4 >= tsrc.colsb / 4
218/// dst.dword[i] := 0
219/// ELSE
220/// dst.word[2*i+1] := 0
221/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
222/// FI
223/// ENDFOR
224/// dst[MAX_VL-1:VL] := 0
225/// zero_tileconfig_start()
226/// \endcode
227///
228/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
229///
230/// \param tsrc
231/// The source tile. Max size is 1024 Bytes.
232/// \param imm8
233/// Bits 5:0 select the row of the source tile; the bits above them (imm8 >> 6) select the 64-byte chunk.
234#define _tile_cvtrowps2bf16li(tsrc, imm8) \
235 __builtin_ia32_tcvtrowps2bf16li(tsrc, imm8)
236
237/// Moves a row from a tile register to a zmm destination register, converting
238/// the fp32 source elements to fp16. It places the resulting fp16 elements
239/// in the high 16 bits within each dword. The row of the tile is selected
240/// by a 32b GPR.
241///
242/// \headerfile <x86intrin.h>
243///
244/// \code
245/// __m512h _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
246/// \endcode
247///
248/// \code{.operation}
249/// VL := 512
250/// VL_bytes := VL >> 3
251/// row_index := row & 0xffff
252/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
253/// FOR i := 0 TO (VL_bytes / 4) - 1
254/// IF i + row_chunk / 4 >= tsrc.colsb / 4
255/// dst.dword[i] := 0
256/// ELSE
257/// dst.word[2*i+0] := 0
258/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
259/// FI
260/// ENDFOR
261/// dst[MAX_VL-1:VL] := 0
262/// zero_tileconfig_start()
263/// \endcode
264///
265/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
266///
267/// \param tsrc
268/// The source tile. Max size is 1024 Bytes.
269/// \param row
270/// Bits 15:0 select the row of the source tile; bits 31:16 select the 64-byte chunk.
271#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
272
273/// Moves a row from a tile register to a zmm destination register, converting
274/// the fp32 source elements to fp16. It places the resulting fp16 elements
275/// in the high 16 bits within each dword. The row of the tile is selected
276/// by an 8b immediate value.
277///
278/// \headerfile <x86intrin.h>
279///
280/// \code
281/// __m512h _tile_cvtrowps2phhi(__tile tsrc, const unsigned int imm8);
282/// \endcode
283///
284/// \code{.operation}
285/// VL := 512
286/// VL_bytes := VL >> 3
287/// row_index := imm8 & 0x3f
288/// row_chunk := (imm8 >> 6) * VL_bytes
289/// FOR i := 0 TO (VL_bytes / 4) - 1
290/// IF i + row_chunk / 4 >= tsrc.colsb / 4
291/// dst.dword[i] := 0
292/// ELSE
293/// dst.word[2*i+0] := 0
294/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
295/// FI
296/// ENDFOR
297/// dst[MAX_VL-1:VL] := 0
298/// zero_tileconfig_start()
299/// \endcode
300///
301/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
302///
303/// \param tsrc
304/// The source tile. Max size is 1024 Bytes.
305/// \param imm8
306/// Bits 5:0 select the row of the source tile; the bits above them (imm8 >> 6) select the 64-byte chunk.
307#define _tile_cvtrowps2phhi(tsrc, imm8) \
308 __builtin_ia32_tcvtrowps2phhi(tsrc, imm8)
309
310/// Moves a row from a tile register to a zmm destination register, converting
311/// the fp32 source elements to fp16. It places the resulting fp16 elements
312/// in the low 16 bits within each dword. The row of the tile is selected
313/// by a 32b GPR.
314///
315/// \headerfile <x86intrin.h>
316///
317/// \code
318/// __m512h _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
319/// \endcode
320///
321/// \code{.operation}
322/// VL := 512
323/// VL_bytes := VL >> 3
324/// row_index := row & 0xffff
325/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
326/// FOR i := 0 TO (VL_bytes / 4) - 1
327/// IF i + row_chunk / 4 >= tsrc.colsb / 4
328/// dst.dword[i] := 0
329/// ELSE
330/// dst.word[2*i+1] := 0
331/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
332/// FI
333/// ENDFOR
334/// dst[MAX_VL-1:VL] := 0
335/// zero_tileconfig_start()
336/// \endcode
337///
338/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
339///
340/// \param tsrc
341/// The source tile. Max size is 1024 Bytes.
342/// \param row
343/// Bits 15:0 select the row of the source tile; bits 31:16 select the 64-byte chunk.
344#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
345
346/// Moves a row from a tile register to a zmm destination register, converting
347/// the fp32 source elements to fp16. It places the resulting fp16 elements
348/// in the low 16 bits within each dword. The row of the tile is selected
349/// by an 8b immediate value.
350///
351/// \headerfile <x86intrin.h>
352///
353/// \code
354/// __m512h _tile_cvtrowps2phli(__tile tsrc, const unsigned int imm8);
355/// \endcode
356///
357/// \code{.operation}
358/// VL := 512
359/// VL_bytes := VL >> 3
360/// row_index := imm8 & 0x3f
361/// row_chunk := (imm8 >> 6) * VL_bytes
362/// FOR i := 0 TO (VL_bytes / 4) - 1
363/// IF i + row_chunk / 4 >= tsrc.colsb / 4
364/// dst.dword[i] := 0
365/// ELSE
366/// dst.word[2*i+1] := 0
367/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
368/// FI
369/// ENDFOR
370/// dst[MAX_VL-1:VL] := 0
371/// zero_tileconfig_start()
372/// \endcode
373///
374/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
375///
376/// \param tsrc
377/// The source tile. Max size is 1024 Bytes.
378/// \param imm8
379/// Bits 5:0 select the row of the source tile; the bits above them (imm8 >> 6) select the 64-byte chunk.
380#define _tile_cvtrowps2phli(tsrc, imm8) \
381 __builtin_ia32_tcvtrowps2phli(tsrc, imm8)
382
383/// Move one row of tile data to a v16i32 vector.
384/// The row of the tile is selected by a 32b GPR.
385///
386/// \headerfile <immintrin.h>
387///
388/// \code
389/// __m512i _tile_movrow(__tile a, unsigned b);
390/// \endcode
391///
392/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
393///
394/// \param a
395/// The 1st source tile. Max size is 1024 Bytes.
396/// \param b
397/// The 2nd source r32. Bits 15:0 select the row; bits 31:16 select the 64-byte chunk. Size is 4 Bytes.
398/// \returns
399/// The destination v16i32 data. Size is 64 Bytes.
400///
401/// \code{.operation}
402/// VL := 512
403/// VL_bytes := VL>>3
404/// row_index := b&0xffff
405/// row_chunk := ((b>>16)&0xffff) * VL_bytes
406/// FOR i := 0 TO (VL_bytes-1)
407/// IF (row_chunk + i >= a.colsb)
408/// dst.byte[i] := 0
409/// ELSE
410/// dst.byte[i] := a.row[row_index].byte[row_chunk+i]
/// FI
411/// ENDFOR
412/// \endcode
413#define _tile_movrow(a, b) ((__m512i)__builtin_ia32_tilemovrow(a, b))
414
415/// Move one row of tile data to a v16i32 vector.
416/// The row of the tile is selected by an 8b immediate value.
417///
418/// \headerfile <immintrin.h>
419///
420/// \code
421/// __m512i _tile_movrowi(__tile a, const unsigned b);
422/// \endcode
423///
424/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
425///
426/// \param a
427/// The 1st source tile. Max size is 1024 Bytes.
428/// \param b
429/// The 8b immediate. Bits 5:0 select the row; the bits above them (b >> 6) select the 64-byte chunk.
430/// \returns
431/// The destination v16i32 data. Size is 64 Bytes.
432///
433/// \code{.operation}
434/// VL := 512
435/// VL_bytes := VL>>3
436/// row_index := b&0x3f
437/// row_chunk := (b>>6) * VL_bytes
438/// FOR i := 0 TO (VL_bytes-1)
439/// IF (row_chunk + i >= a.colsb)
440/// dst.byte[i] := 0
441/// ELSE
442/// dst.byte[i] := a.row[row_index].byte[row_chunk+i]
/// FI
443/// ENDFOR
444/// \endcode
445#define _tile_movrowi(a, b) ((__m512i)__builtin_ia32_tilemovrowi(a, b))
446
447/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
448
/// Forwards \p m, \p n, \p src and \p u unchanged to
/// __builtin_ia32_tcvtrowd2ps_internal and returns its __m512 result.
/// __tile_cvtrowd2ps calls it with src0.row, src0.col, src0.tile and src1.
449static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
450 unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
451 return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
452}
453
/// Internal helper; not intended to be called directly by user code.
/// Forwards \p m, \p n, \p src and \p u unchanged to
/// __builtin_ia32_tcvtrowps2bf16h_internal and returns its __m512bh result.
/// __tile_cvtrowps2bf16h calls it with src0.row, src0.col, src0.tile and src1.
454static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
455_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
456 _tile1024i src, unsigned u) {
457 return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
458}
459
/// Internal helper; not intended to be called directly by user code.
/// Forwards \p m, \p n, \p src and \p u unchanged to
/// __builtin_ia32_tcvtrowps2bf16l_internal and returns its __m512bh result.
/// __tile_cvtrowps2bf16l calls it with src0.row, src0.col, src0.tile and src1.
460static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
461_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
462 _tile1024i src, unsigned u) {
463 return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
464}
465
/// Internal helper; not intended to be called directly by user code.
/// Forwards \p m, \p n, \p src and \p u unchanged to
/// __builtin_ia32_tcvtrowps2phh_internal and returns its __m512h result.
/// __tile_cvtrowps2phh calls it with src0.row, src0.col, src0.tile and src1.
466static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
467 unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
468 return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
469}
470
/// Internal helper; not intended to be called directly by user code.
/// Forwards \p m, \p n, \p src and \p u unchanged to
/// __builtin_ia32_tcvtrowps2phl_internal and returns its __m512h result.
/// __tile_cvtrowps2phl calls it with src0.row, src0.col, src0.tile and src1.
471static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
472 unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
473 return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
474}
475
/// Internal helper; not intended to be called directly by user code.
/// Forwards \p m, \p n, \p src and \p u unchanged to
/// __builtin_ia32_tilemovrow_internal and returns the result cast to __m512i.
/// __tile_movrow calls it with src0.row, src0.col, src0.tile and src1.
476static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
477 unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
478 return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
479}
480
481/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
482/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
483/// MXCSR.RC=RNE. Embedded rounding is not supported.
484/// The row and chunk indices into the tile are taken from the 32-bit src1.
485///
486/// \headerfile <immintrin.h>
487///
488/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
489///
490/// \param src0
491/// The 1st source tile. Max size is 1024 Bytes.
492/// \param src1
493/// The 2nd source r32. Size is 4 Bytes.
494/// \returns
495/// The destination v16f32 data. Size is 64 Bytes.
496__DEFAULT_FN_ATTRS_AVX512
497static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
498 return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
499}
500
501/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
502/// elements to bf16 in the high 16 bits of each dword.
503/// The row and chunk indices into the tile are taken from the 32-bit src1.
504///
505/// \headerfile <immintrin.h>
506///
507/// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction.
508///
509/// \param src0
510/// The 1st source tile. Max size is 1024 Bytes.
511/// \param src1
512/// The 2nd source r32. Size is 4 Bytes.
513/// \returns
514/// The destination v32bf16 data. Size is 64 Bytes.
515__DEFAULT_FN_ATTRS_AVX512
516static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
517 return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
518}
519
520/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
521/// elements to bf16 in the low 16 bits of each dword.
522/// The row and chunk indices into the tile are taken from the 32-bit src1.
523///
524/// \headerfile <immintrin.h>
525///
526/// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction.
527///
528/// \param src0
529/// The 1st source tile. Max size is 1024 Bytes.
530/// \param src1
531/// The 2nd source r32. Size is 4 Bytes.
532/// \returns
533/// The destination v32bf16 data. Size is 64 Bytes.
534__DEFAULT_FN_ATTRS_AVX512
535static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
536 return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
537}
538
539/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
540/// elements to fp16 in the high 16 bits of each dword.
541/// The row and chunk indices into the tile are taken from the 32-bit src1.
542///
543/// \headerfile <immintrin.h>
544///
545/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
546///
547/// \param src0
548/// The 1st source tile. Max size is 1024 Bytes.
549/// \param src1
550/// The 2nd source r32. Size is 4 Bytes.
551/// \returns
552/// The destination v32fp16 data. Size is 64 Bytes.
553__DEFAULT_FN_ATTRS_AVX512
554static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
555 return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
556}
557
558/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
559/// elements to fp16 in the low 16 bits of each dword.
560/// The row and chunk indices into the tile are taken from the 32-bit src1.
561///
562/// \headerfile <immintrin.h>
563///
564/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
565///
566/// \param src0
567/// The 1st source tile. Max size is 1024 Bytes.
568/// \param src1
569/// The 2nd source r32. Size is 4 Bytes.
570/// \returns
571/// The destination v32fp16 data. Size is 64 Bytes.
572__DEFAULT_FN_ATTRS_AVX512
573static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
574 return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
575}
576
577/// Move one row of tile data to a v16i32 vector.
578/// The row of the tile is selected by a 32b GPR.
579///
580/// \headerfile <immintrin.h>
581///
582/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
583///
584/// \param src0
585/// The 1st source tile. Max size is 1024 Bytes.
586/// \param src1
587/// The 2nd source r32. Size is 4 Bytes.
588/// \returns
589/// The destination v16i32 data. Size is 64 Bytes.
590__DEFAULT_FN_ATTRS_AVX512
591static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
592 return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
593}
594
595#endif // __x86_64__ && __SSE2__
596#endif // __AMX_AVX512INTRIN_H