clang 20.0.0git
amxintrin.h
Go to the documentation of this file.
1/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===------------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
12#endif /* __IMMINTRIN_H */
13
14#ifndef __AMXINTRIN_H
15#define __AMXINTRIN_H
16#ifdef __x86_64__
17
18/* Define the default attributes for the functions in this file. */
19#define __DEFAULT_FN_ATTRS_TILE \
20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
21#define __DEFAULT_FN_ATTRS_INT8 \
22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
23#define __DEFAULT_FN_ATTRS_BF16 \
24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
25#define __DEFAULT_FN_ATTRS_FP16 \
26 __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
27
28/// Load tile configuration from a 64-byte memory location specified by
29/// "mem_addr". The tile configuration includes the tile type palette, the
30/// number of bytes per row, and the number of rows. If the specified
31/// palette_id is zero, that signifies the init state for both the tile
32/// config and the tile data, and the tiles are zeroed. Any invalid
33/// configurations will result in #GP fault.
34///
35/// \headerfile <immintrin.h>
36///
37/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
38///
39/// \param __config
40/// A pointer to 512-bits configuration
41static __inline__ void __DEFAULT_FN_ATTRS_TILE
42_tile_loadconfig(const void *__config) {
43 __builtin_ia32_tile_loadconfig(__config);
44}
45
46/// Stores the current tile configuration to a 64-byte memory location
47/// specified by "mem_addr". The tile configuration includes the tile type
48/// palette, the number of bytes per row, and the number of rows. If tiles
49/// are not configured, all zeroes will be stored to memory.
50///
51/// \headerfile <immintrin.h>
52///
53/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
54///
55/// \param __config
56/// A pointer to 512-bits configuration
57static __inline__ void __DEFAULT_FN_ATTRS_TILE
58_tile_storeconfig(void *__config) {
59 __builtin_ia32_tile_storeconfig(__config);
60}
61
62/// Release the tile configuration to return to the init state, which
63/// releases all storage it currently holds.
64///
65/// \headerfile <immintrin.h>
66///
67/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
68static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
69 __builtin_ia32_tilerelease();
70}
71
72/// Load tile rows from memory specifieid by "base" address and "stride" into
73/// destination tile "dst" using the tile configuration previously configured
74/// via "_tile_loadconfig".
75///
76/// \headerfile <immintrin.h>
77///
78/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
79///
80/// \param dst
81/// A destination tile. Max size is 1024 Bytes.
82/// \param base
83/// A pointer to base address.
84/// \param stride
85/// The stride between the rows' data to be loaded in memory.
86#define _tile_loadd(dst, base, stride) \
87 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
88 (__SIZE_TYPE__)(stride))
89
90/// Load tile rows from memory specifieid by "base" address and "stride" into
91/// destination tile "dst" using the tile configuration previously configured
92/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
93/// that the data will likely not be reused in the near future and the data
94/// caching can be optimized accordingly.
95///
96/// \headerfile <immintrin.h>
97///
98/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
99///
100/// \param dst
101/// A destination tile. Max size is 1024 Bytes.
102/// \param base
103/// A pointer to base address.
104/// \param stride
105/// The stride between the rows' data to be loaded in memory.
106#define _tile_stream_loadd(dst, base, stride) \
107 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
108 (__SIZE_TYPE__)(stride))
109
110/// Store the tile specified by "src" to memory specifieid by "base" address and
111/// "stride" using the tile configuration previously configured via
112/// "_tile_loadconfig".
113///
114/// \headerfile <immintrin.h>
115///
116/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
117///
118/// \param dst
119/// A destination tile. Max size is 1024 Bytes.
120/// \param base
121/// A pointer to base address.
122/// \param stride
123/// The stride between the rows' data to be stored in memory.
124#define _tile_stored(dst, base, stride) \
125 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
126
127/// Zero the tile specified by "tdest".
128///
129/// \headerfile <immintrin.h>
130///
131/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
132///
133/// \param tile
134/// The destination tile to be zero. Max size is 1024 Bytes.
135#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
136
137/// Compute dot-product of bytes in tiles with a source/destination accumulator.
138/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
139/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
140/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
141/// and store the 32-bit result back to tile "dst".
142///
143/// \headerfile <immintrin.h>
144///
145/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
146///
147/// \param dst
148/// The destination tile. Max size is 1024 Bytes.
149/// \param src0
150/// The 1st source tile. Max size is 1024 Bytes.
151/// \param src1
152/// The 2nd source tile. Max size is 1024 Bytes.
153#define _tile_dpbssd(dst, src0, src1) \
154 __builtin_ia32_tdpbssd((dst), (src0), (src1))
155
156/// Compute dot-product of bytes in tiles with a source/destination accumulator.
157/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
158/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
159/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
160/// in "dst", and store the 32-bit result back to tile "dst".
161///
162/// \headerfile <immintrin.h>
163///
164/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
165///
166/// \param dst
167/// The destination tile. Max size is 1024 Bytes.
168/// \param src0
169/// The 1st source tile. Max size is 1024 Bytes.
170/// \param src1
171/// The 2nd source tile. Max size is 1024 Bytes.
172#define _tile_dpbsud(dst, src0, src1) \
173 __builtin_ia32_tdpbsud((dst), (src0), (src1))
174
175/// Compute dot-product of bytes in tiles with a source/destination accumulator.
176/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
177/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
178/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
179/// and store the 32-bit result back to tile "dst".
180///
181/// \headerfile <immintrin.h>
182///
183/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
184///
185/// \param dst
186/// The destination tile. Max size is 1024 Bytes.
187/// \param src0
188/// The 1st source tile. Max size is 1024 Bytes.
189/// \param src1
190/// The 2nd source tile. Max size is 1024 Bytes.
191#define _tile_dpbusd(dst, src0, src1) \
192 __builtin_ia32_tdpbusd((dst), (src0), (src1))
193
194/// Compute dot-product of bytes in tiles with a source/destination accumulator.
195/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
196/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
197/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
198/// "dst", and store the 32-bit result back to tile "dst".
199///
200/// \headerfile <immintrin.h>
201///
202/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
203///
204/// \param dst
205/// The destination tile. Max size is 1024 Bytes.
206/// \param src0
207/// The 1st source tile. Max size is 1024 Bytes.
208/// \param src1
209/// The 2nd source tile. Max size is 1024 Bytes.
210#define _tile_dpbuud(dst, src0, src1) \
211 __builtin_ia32_tdpbuud((dst), (src0), (src1))
212
213/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
214/// src1, accumulating the intermediate single-precision (32-bit) floating-point
215/// elements with elements in "dst", and store the 32-bit result back to tile
216/// "dst".
217///
218/// \headerfile <immintrin.h>
219///
220/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
221///
222/// \param dst
223/// The destination tile. Max size is 1024 Bytes.
224/// \param src0
225/// The 1st source tile. Max size is 1024 Bytes.
226/// \param src1
227/// The 2nd source tile. Max size is 1024 Bytes.
228#define _tile_dpbf16ps(dst, src0, src1) \
229 __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
230
231/// AMX tile register size can be configured, the maximum size is 16x64=1024
232/// bytes. Since there is no 2D type in llvm IR, we use vector type to
233/// represent 2D tile and the fixed size is maximum amx tile register size.
234typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
235
236/// This is internal intrinsic. C/C++ user should avoid calling it directly.
237static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
238_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
239 __SIZE_TYPE__ stride) {
240 return __builtin_ia32_tileloadd64_internal(m, n, base,
241 (__SIZE_TYPE__)(stride));
242}
243
244/// This is internal intrinsic. C/C++ user should avoid calling it directly.
245static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
246_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
247 __SIZE_TYPE__ stride) {
248 return __builtin_ia32_tileloaddt164_internal(m, n, base,
249 (__SIZE_TYPE__)(stride));
250}
251
252/// This is internal intrinsic. C/C++ user should avoid calling it directly.
253static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
254_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
255 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
256 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
257}
258
259/// This is internal intrinsic. C/C++ user should avoid calling it directly.
260static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
261_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
262 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
263 return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
264}
265
266/// This is internal intrinsic. C/C++ user should avoid calling it directly.
267static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
268_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
269 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
270 return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
271}
272
273/// This is internal intrinsic. C/C++ user should avoid calling it directly.
274static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
275_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
276 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
277 return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
278}
279
280/// This is internal intrinsic. C/C++ user should avoid calling it directly.
281static __inline__ void __DEFAULT_FN_ATTRS_INT8
282_tile_stored_internal(unsigned short m, unsigned short n, void *base,
283 __SIZE_TYPE__ stride, _tile1024i tile) {
284 return __builtin_ia32_tilestored64_internal(m, n, base,
285 (__SIZE_TYPE__)(stride), tile);
286}
287
288/// This is internal intrinsic. C/C++ user should avoid calling it directly.
289static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
290_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
291 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
292 return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
293}
294
295/// This is internal intrinsic. C/C++ user should avoid calling it directly.
296static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
297_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
298 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
299 return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
300}
301
302/// This struct pack the shape and tile data together for user. We suggest
303/// initializing the struct as early as possible, because compiler depends
304/// on the shape information to do configure. The constant value is preferred
305/// for optimization by compiler.
306typedef struct __tile1024i_str {
307 const unsigned short row;
308 const unsigned short col;
309 _tile1024i tile;
310} __tile1024i;
311
312/// Load tile rows from memory specifieid by "base" address and "stride" into
313/// destination tile "dst".
314///
315/// \headerfile <immintrin.h>
316///
317/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
318///
319/// \param dst
320/// A destination tile. Max size is 1024 Bytes.
321/// \param base
322/// A pointer to base address.
323/// \param stride
324/// The stride between the rows' data to be loaded in memory.
325__DEFAULT_FN_ATTRS_TILE
326static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
327 __SIZE_TYPE__ stride) {
328 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
329}
330
331/// Load tile rows from memory specifieid by "base" address and "stride" into
332/// destination tile "dst". This intrinsic provides a hint to the implementation
333/// that the data will likely not be reused in the near future and the data
334/// caching can be optimized accordingly.
335///
336/// \headerfile <immintrin.h>
337///
338/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
339///
340/// \param dst
341/// A destination tile. Max size is 1024 Bytes.
342/// \param base
343/// A pointer to base address.
344/// \param stride
345/// The stride between the rows' data to be loaded in memory.
346__DEFAULT_FN_ATTRS_TILE
347static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
348 __SIZE_TYPE__ stride) {
349 dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
350}
351
352/// Compute dot-product of bytes in tiles with a source/destination accumulator.
353/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
354/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
355/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
356/// and store the 32-bit result back to tile "dst".
357///
358/// \headerfile <immintrin.h>
359///
360/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
361///
362/// \param dst
363/// The destination tile. Max size is 1024 Bytes.
364/// \param src0
365/// The 1st source tile. Max size is 1024 Bytes.
366/// \param src1
367/// The 2nd source tile. Max size is 1024 Bytes.
368__DEFAULT_FN_ATTRS_INT8
369static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
370 __tile1024i src1) {
371 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
372 src0.tile, src1.tile);
373}
374
375/// Compute dot-product of bytes in tiles with a source/destination accumulator.
376/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
377/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
378/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
379/// in "dst", and store the 32-bit result back to tile "dst".
380///
381/// \headerfile <immintrin.h>
382///
383/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
384///
385/// \param dst
386/// The destination tile. Max size is 1024 Bytes.
387/// \param src0
388/// The 1st source tile. Max size is 1024 Bytes.
389/// \param src1
390/// The 2nd source tile. Max size is 1024 Bytes.
391__DEFAULT_FN_ATTRS_INT8
392static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
393 __tile1024i src1) {
394 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
395 src0.tile, src1.tile);
396}
397
398/// Compute dot-product of bytes in tiles with a source/destination accumulator.
399/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
400/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
401/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
402/// and store the 32-bit result back to tile "dst".
403///
404/// \headerfile <immintrin.h>
405///
406/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
407///
408/// \param dst
409/// The destination tile. Max size is 1024 Bytes.
410/// \param src0
411/// The 1st source tile. Max size is 1024 Bytes.
412/// \param src1
413/// The 2nd source tile. Max size is 1024 Bytes.
414__DEFAULT_FN_ATTRS_INT8
415static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
416 __tile1024i src1) {
417 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
418 src0.tile, src1.tile);
419}
420
421/// Compute dot-product of bytes in tiles with a source/destination accumulator.
422/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
423/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
424/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
425/// "dst", and store the 32-bit result back to tile "dst".
426///
427/// \headerfile <immintrin.h>
428///
429/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
430///
431/// \param dst
432/// The destination tile. Max size is 1024 Bytes.
433/// \param src0
434/// The 1st source tile. Max size is 1024 Bytes.
435/// \param src1
436/// The 2nd source tile. Max size is 1024 Bytes.
437__DEFAULT_FN_ATTRS_INT8
438static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
439 __tile1024i src1) {
440 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
441 src0.tile, src1.tile);
442}
443
444/// Store the tile specified by "src" to memory specifieid by "base" address and
445/// "stride".
446///
447/// \headerfile <immintrin.h>
448///
449/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
450///
451/// \param base
452/// A pointer to base address.
453/// \param stride
454/// The stride between the rows' data to be stored in memory.
455__DEFAULT_FN_ATTRS_TILE
456static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
457 __tile1024i src) {
458 _tile_stored_internal(src.row, src.col, base, stride, src.tile);
459}
460
461/// Zero the tile specified by "dst".
462///
463/// \headerfile <immintrin.h>
464///
465/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
466///
467/// \param dst
468/// The destination tile to be zero. Max size is 1024 Bytes.
469__DEFAULT_FN_ATTRS_TILE
470static __inline__ void __tile_zero(__tile1024i *dst) {
471 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
472}
473
474/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
475/// src1, accumulating the intermediate single-precision (32-bit) floating-point
476/// elements with elements in "dst", and store the 32-bit result back to tile
477/// "dst".
478///
479/// \headerfile <immintrin.h>
480///
481/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
482///
483/// \param dst
484/// The destination tile. Max size is 1024 Bytes.
485/// \param src0
486/// The 1st source tile. Max size is 1024 Bytes.
487/// \param src1
488/// The 2nd source tile. Max size is 1024 Bytes.
489__DEFAULT_FN_ATTRS_BF16
490static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
491 __tile1024i src1) {
492 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
493 src0.tile, src1.tile);
494}
495
496/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
497/// src1, accumulating the intermediate single-precision (32-bit) floating-point
498/// elements with elements in "dst", and store the 32-bit result back to tile
499/// "dst".
500///
501/// \headerfile <immintrin.h>
502///
503/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
504///
505/// \param dst
506/// The destination tile. Max size is 1024 Bytes.
507/// \param src0
508/// The 1st source tile. Max size is 1024 Bytes.
509/// \param src1
510/// The 2nd source tile. Max size is 1024 Bytes.
511__DEFAULT_FN_ATTRS_FP16
512static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
513 __tile1024i src1) {
514 dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
515 src0.tile, src1.tile);
516}
517
518#undef __DEFAULT_FN_ATTRS_TILE
519#undef __DEFAULT_FN_ATTRS_INT8
520#undef __DEFAULT_FN_ATTRS_BF16
521#undef __DEFAULT_FN_ATTRS_FP16
522
523#endif /* __x86_64__ */
524#endif /* __AMXINTRIN_H */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.