/* Stops compilation with a message telling the user to include <immintrin.h>
 * instead of this header.
 * NOTE(review): the conditional that guards this directive is not visible in
 * this text -- presumably an #ifndef on the immintrin.h include guard; confirm.
 * NOTE(review): the leading number on this line is not valid C and looks like
 * extraction residue.
 */
11#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
/* Attribute bundles applied to the functions defined in this header. All three
 * combine __always_inline__ and __nodebug__ with a per-function __target__
 * feature string, so a function can be defined without that feature being
 * enabled for the whole translation unit.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
/* Attributes for functions that need the "amx-tile" target feature. */
19#define __DEFAULT_FN_ATTRS_TILE \
20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
/* Attributes for functions that need the "amx-int8" target feature. */
21#define __DEFAULT_FN_ATTRS_INT8 \
22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
/* Attributes for functions that need the "amx-bf16" target feature. */
23#define __DEFAULT_FN_ATTRS_BF16 \
24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
/* Forwards __config unchanged to __builtin_ia32_tile_loadconfig.
 *
 * __config: const-qualified pointer handed to the builtin; this wrapper never
 *           writes through it. Presumably the address of an AMX tile
 *           configuration block -- TODO confirm the required size and layout
 *           against the Intel AMX documentation.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
39static __inline__
void __DEFAULT_FN_ATTRS_TILE
40_tile_loadconfig(
const void *__config) {
41 __builtin_ia32_tile_loadconfig(__config);
/* Forwards __config unchanged to __builtin_ia32_tile_storeconfig.
 *
 * __config: non-const pointer handed to the builtin. Presumably the
 *           destination the current tile configuration is written to --
 *           TODO confirm the required buffer size against the Intel AMX
 *           documentation.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
55static __inline__
void __DEFAULT_FN_ATTRS_TILE
56_tile_storeconfig(
void *__config) {
57 __builtin_ia32_tile_storeconfig(__config);
/* Calls __builtin_ia32_tilerelease with no arguments; takes nothing and
 * returns nothing.
 * Presumably returns the tile state to its initial/unconfigured condition --
 * TODO confirm against the Intel AMX documentation.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
66static __inline__
void __DEFAULT_FN_ATTRS_TILE _tile_release(
void) {
67 __builtin_ia32_tilerelease();
/* Expands to a call to __builtin_ia32_tileloadd64.
 *
 * dst:    passed through as the first builtin argument; presumably a
 *         compile-time tile register number -- TODO confirm.
 * base:   cast to const void * before being passed.
 * stride: cast to __SIZE_TYPE__ before being passed.
 *
 * Every macro argument is parenthesized in the expansion.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
84#define _tile_loadd(dst, base, stride) \
85 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
86 (__SIZE_TYPE__)(stride))
/* Expands to a call to __builtin_ia32_tileloaddt164; same argument handling
 * as _tile_loadd but targeting a different builtin.
 *
 * dst:    passed through as the first builtin argument; presumably a
 *         compile-time tile register number -- TODO confirm.
 * base:   cast to const void * before being passed.
 * stride: cast to __SIZE_TYPE__ before being passed.
 *
 * Presumably the "t1" variant is a load with a non-temporal/streaming hint,
 * as the macro name suggests -- TODO confirm.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
104#define _tile_stream_loadd(dst, base, stride) \
105 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
106 (__SIZE_TYPE__)(stride))
/* Expands to a call to __builtin_ia32_tilestored64.
 *
 * dst:    passed through as the first builtin argument.
 *         NOTE(review): for a store this presumably names the tile being
 *         written out (i.e. the source), so "dst" may be a misnomer --
 *         confirm and consider renaming.
 * base:   cast to (non-const) void * before being passed.
 * stride: cast to __SIZE_TYPE__ before being passed.
 *
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
122#define _tile_stored(dst, base, stride) \
123 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
/* Expands to __builtin_ia32_tilezero((tile)); the single argument is passed
 * through parenthesized. Presumably `tile` is a compile-time tile register
 * number whose contents are cleared -- TODO confirm.
 * NOTE(review): the leading number on this line is not valid C and looks like
 * extraction residue.
 */
133#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
/* Expands to __builtin_ia32_tdpbssd((dst), (src0), (src1)); the three
 * arguments are forwarded in order, each parenthesized.
 * Presumably a byte dot-product accumulate with signed x signed operands
 * (the "ss" in the name) -- TODO confirm against the Intel AMX documentation.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
151#define _tile_dpbssd(dst, src0, src1) \
152 __builtin_ia32_tdpbssd((dst), (src0), (src1))
/* Expands to __builtin_ia32_tdpbsud((dst), (src0), (src1)); the three
 * arguments are forwarded in order, each parenthesized.
 * Presumably a byte dot-product accumulate with signed x unsigned operands
 * (the "su" in the name) -- TODO confirm against the Intel AMX documentation.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
170#define _tile_dpbsud(dst, src0, src1) \
171 __builtin_ia32_tdpbsud((dst), (src0), (src1))
/* Expands to __builtin_ia32_tdpbusd((dst), (src0), (src1)); the three
 * arguments are forwarded in order, each parenthesized.
 * Presumably a byte dot-product accumulate with unsigned x signed operands
 * (the "us" in the name) -- TODO confirm against the Intel AMX documentation.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
189#define _tile_dpbusd(dst, src0, src1) \
190 __builtin_ia32_tdpbusd((dst), (src0), (src1))
/* Expands to __builtin_ia32_tdpbuud((dst), (src0), (src1)); the three
 * arguments are forwarded in order, each parenthesized.
 * Presumably a byte dot-product accumulate with unsigned x unsigned operands
 * (the "uu" in the name) -- TODO confirm against the Intel AMX documentation.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
208#define _tile_dpbuud(dst, src0, src1) \
209 __builtin_ia32_tdpbuud((dst), (src0), (src1))
/* Expands to __builtin_ia32_tdpbf16ps((dst), (src0), (src1)); the three
 * arguments are forwarded in order, each parenthesized.
 * Presumably a BF16 dot-product accumulating into single-precision floats
 * (per the "bf16ps" suffix) -- TODO confirm against the Intel AMX
 * documentation.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
226#define _tile_dpbf16ps(dst, src0, src1) \
227 __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
/* Vector-of-int type used as the value type for tile data in the *_internal
 * helpers: __vector_size__(1024) gives a 1024-byte vector (256 elements,
 * assuming a 4-byte int) and __aligned__(64) requests 64-byte alignment.
 * NOTE(review): the leading number on the first line is not valid C and looks
 * like extraction residue.
 */
232typedef int _tile1024i
__attribute__((__vector_size__(1024), __aligned__(64)));
/* Returns the _tile1024i value produced by
 * __builtin_ia32_tileloadd64_internal(m, n, base, stride).
 *
 * m, n:   forwarded as the first two builtin arguments; presumably the tile
 *         shape (rows and column extent) -- TODO confirm units.
 * base:   const pointer forwarded unchanged.
 * stride: forwarded after a cast to __SIZE_TYPE__; the parameter already has
 *         that type, so the cast does not change the value.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
235static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
236_tile_loadd_internal(
unsigned short m,
unsigned short n,
const void *base,
237 __SIZE_TYPE__ stride) {
238 return __builtin_ia32_tileloadd64_internal(m, n, base,
239 (__SIZE_TYPE__)(stride));
/* Returns the _tile1024i value produced by
 * __builtin_ia32_tileloaddt164_internal(m, n, base, stride).
 *
 * m, n:   forwarded as the first two builtin arguments; presumably the tile
 *         shape (rows and column extent) -- TODO confirm units.
 * base:   const pointer forwarded unchanged.
 * stride: forwarded after a cast to __SIZE_TYPE__; the parameter already has
 *         that type, so the cast does not change the value.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
243static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
244_tile_loaddt1_internal(
unsigned short m,
unsigned short n,
const void *base,
245 __SIZE_TYPE__ stride) {
246 return __builtin_ia32_tileloaddt164_internal(m, n, base,
247 (__SIZE_TYPE__)(stride));
/* Returns the _tile1024i value produced by
 * __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); all six
 * arguments are forwarded unchanged and in order.
 *
 * m, n, k:    presumably the dimensions of the tile operation -- TODO confirm
 *             which operand each one describes.
 * dst:        passed by value; the updated tile is the return value, the
 *             caller's copy is not modified by this wrapper.
 * src1, src2: input tiles, passed by value.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
251static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
252_tile_dpbssd_internal(
unsigned short m,
unsigned short n,
unsigned short k,
253 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
254 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
/* Returns the _tile1024i value produced by
 * __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); all six
 * arguments are forwarded unchanged and in order.
 *
 * m, n, k:    presumably the dimensions of the tile operation -- TODO confirm
 *             which operand each one describes.
 * dst:        passed by value; the updated tile is the return value, the
 *             caller's copy is not modified by this wrapper.
 * src1, src2: input tiles, passed by value.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
258static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
259_tile_dpbsud_internal(
unsigned short m,
unsigned short n,
unsigned short k,
260 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
261 return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
/* Returns the _tile1024i value produced by
 * __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); all six
 * arguments are forwarded unchanged and in order.
 *
 * m, n, k:    presumably the dimensions of the tile operation -- TODO confirm
 *             which operand each one describes.
 * dst:        passed by value; the updated tile is the return value, the
 *             caller's copy is not modified by this wrapper.
 * src1, src2: input tiles, passed by value.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
265static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
266_tile_dpbusd_internal(
unsigned short m,
unsigned short n,
unsigned short k,
267 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
268 return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
/* Returns the _tile1024i value produced by
 * __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); all six
 * arguments are forwarded unchanged and in order.
 *
 * m, n, k:    presumably the dimensions of the tile operation -- TODO confirm
 *             which operand each one describes.
 * dst:        passed by value; the updated tile is the return value, the
 *             caller's copy is not modified by this wrapper.
 * src1, src2: input tiles, passed by value.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
272static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
273_tile_dpbuud_internal(
unsigned short m,
unsigned short n,
unsigned short k,
274 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
275 return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
/* Forwards (m, n, base, stride, tile) to
 * __builtin_ia32_tilestored64_internal; the function itself returns void.
 *
 * m, n:   forwarded as the first two builtin arguments; presumably the tile
 *         shape (rows and column extent) -- TODO confirm units.
 * base:   non-const pointer forwarded unchanged; presumably the destination
 *         memory of the store -- TODO confirm.
 * stride: forwarded after a cast to __SIZE_TYPE__; the parameter already has
 *         that type, so the cast does not change the value.
 * tile:   tile value passed by value as the last builtin argument.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): `return <expr>;` inside a function declared void is a
 * constraint violation in ISO C (C11 6.8.6.4) that compilers accept only as
 * an extension; consider dropping the `return` keyword.
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
279static __inline__
void __DEFAULT_FN_ATTRS_TILE
280_tile_stored_internal(
unsigned short m,
unsigned short n,
void *base,
281 __SIZE_TYPE__ stride, _tile1024i tile) {
282 return __builtin_ia32_tilestored64_internal(m, n, base,
283 (__SIZE_TYPE__)(stride), tile);
/* Returns the _tile1024i value produced by
 * __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); all six
 * arguments are forwarded unchanged and in order.
 *
 * m, n, k:    presumably the dimensions of the tile operation -- TODO confirm
 *             which operand each one describes.
 * dst:        passed by value; the updated tile is the return value, the
 *             caller's copy is not modified by this wrapper.
 * src1, src2: input tiles, passed by value.
 *
 * Compiled with the "amx-bf16" attribute bundle (__DEFAULT_FN_ATTRS_BF16).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
287static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
288_tile_dpbf16ps_internal(
unsigned short m,
unsigned short n,
unsigned short k,
289 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
290 return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
/* Struct describing one tile. The two members visible here are const, so
 * they can only be set when the object is initialized.
 * NOTE(review): the remainder of this definition is missing from this text.
 * Functions later in the file read and write a `tile` member and refer to the
 * type as `__tile1024i`, so a `tile` field and the typedef name presumably
 * followed here -- restore from the upstream header.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
297typedef struct __tile1024i_str {
/* Passed as the `m` argument of the *_internal helpers; presumably the row
 * count -- TODO confirm. */
298 const unsigned short row;
/* Passed as the `n` (or `k`) argument of the *_internal helpers; presumably
 * the column extent -- TODO confirm whether in bytes or elements. */
299 const unsigned short col;
/* Assigns to dst->tile the value returned by
 * _tile_loadd_internal(dst->row, dst->col, base, stride).
 *
 * dst:    tile descriptor; its row and col fields supply the first two helper
 *         arguments and its tile field receives the result.
 * base:   const pointer forwarded unchanged to the helper.
 * stride: forwarded unchanged to the helper.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
316__DEFAULT_FN_ATTRS_TILE
317static __inline__
void __tile_loadd(__tile1024i *dst,
const void *base,
318 __SIZE_TYPE__ stride) {
319 dst->tile = _tile_loadd_internal(
dst->row,
dst->col, base, stride);
/* Assigns to dst->tile the value returned by
 * _tile_loaddt1_internal(dst->row, dst->col, base, stride).
 *
 * dst:    tile descriptor; its row and col fields supply the first two helper
 *         arguments and its tile field receives the result.
 * base:   const pointer forwarded unchanged to the helper.
 * stride: forwarded unchanged to the helper.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
337__DEFAULT_FN_ATTRS_TILE
338static __inline__
void __tile_stream_loadd(__tile1024i *dst,
const void *base,
339 __SIZE_TYPE__ stride) {
340 dst->tile = _tile_loaddt1_internal(
dst->row,
dst->col, base, stride);
/* Assigns to dst->tile the value returned by _tile_dpbssd_internal.
 * The helper's (m, n, k) arguments are taken from src0.row, src1.col and
 * src0.col respectively; the current dst->tile is passed as the helper's dst
 * argument, followed by src0.tile and src1.tile. dst->row and dst->col are
 * not consulted.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the body uses `src1`, but its parameter declaration and the
 * opening brace are missing from this text, as is the closing brace; the
 * leading numbers are not valid C. Restore from the upstream header.
 */
359__DEFAULT_FN_ATTRS_INT8
360static __inline__
void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
362 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col,
dst->tile,
363 src0.tile, src1.tile);
/* Assigns to dst->tile the value returned by _tile_dpbsud_internal.
 * The helper's (m, n, k) arguments are taken from src0.row, src1.col and
 * src0.col respectively; the current dst->tile is passed as the helper's dst
 * argument, followed by src0.tile and src1.tile. dst->row and dst->col are
 * not consulted.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the body uses `src1`, but its parameter declaration and the
 * opening brace are missing from this text, as is the closing brace; the
 * leading numbers are not valid C. Restore from the upstream header.
 */
382__DEFAULT_FN_ATTRS_INT8
383static __inline__
void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
385 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col,
dst->tile,
386 src0.tile, src1.tile);
/* Assigns to dst->tile the value returned by _tile_dpbusd_internal.
 * The helper's (m, n, k) arguments are taken from src0.row, src1.col and
 * src0.col respectively; the current dst->tile is passed as the helper's dst
 * argument, followed by src0.tile and src1.tile. dst->row and dst->col are
 * not consulted.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the body uses `src1`, but its parameter declaration and the
 * opening brace are missing from this text, as is the closing brace; the
 * leading numbers are not valid C. Restore from the upstream header.
 */
405__DEFAULT_FN_ATTRS_INT8
406static __inline__
void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
408 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col,
dst->tile,
409 src0.tile, src1.tile);
/* Assigns to dst->tile the value returned by _tile_dpbuud_internal.
 * The helper's (m, n, k) arguments are taken from src0.row, src1.col and
 * src0.col respectively; the current dst->tile is passed as the helper's dst
 * argument, followed by src0.tile and src1.tile. dst->row and dst->col are
 * not consulted.
 *
 * Compiled with the "amx-int8" attribute bundle (__DEFAULT_FN_ATTRS_INT8).
 * NOTE(review): the body uses `src1`, but its parameter declaration and the
 * opening brace are missing from this text, as is the closing brace; the
 * leading numbers are not valid C. Restore from the upstream header.
 */
428__DEFAULT_FN_ATTRS_INT8
429static __inline__
void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
431 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col,
dst->tile,
432 src0.tile, src1.tile);
/* Calls _tile_stored_internal(src.row, src.col, base, stride, src.tile):
 * the tile descriptor's row/col fields supply the helper's first two
 * arguments and its tile field supplies the data; base and stride are
 * forwarded unchanged.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the body uses `src`, but its parameter declaration and the
 * opening brace are missing from this text, as is the closing brace; the
 * leading numbers are not valid C. Restore from the upstream header.
 */
446__DEFAULT_FN_ATTRS_TILE
447static __inline__
void __tile_stored(
void *base, __SIZE_TYPE__ stride,
449 _tile_stored_internal(src.row, src.col, base, stride, src.tile);
/* Assigns to dst->tile the value returned by
 * __builtin_ia32_tilezero_internal(dst->row, dst->col).
 *
 * dst: tile descriptor; its row and col fields are the builtin's arguments
 *      and its tile field receives the result. Presumably the result is an
 *      all-zero tile of that shape -- TODO confirm.
 *
 * Compiled with the "amx-tile" attribute bundle (__DEFAULT_FN_ATTRS_TILE).
 * NOTE(review): the leading numbers and the missing closing brace look like
 * extraction damage; restore from the upstream header before compiling.
 */
460__DEFAULT_FN_ATTRS_TILE
461static __inline__
void __tile_zero(__tile1024i *dst) {
462 dst->tile = __builtin_ia32_tilezero_internal(
dst->row,
dst->col);
/* Assigns to dst->tile the value returned by _tile_dpbf16ps_internal.
 * The helper's (m, n, k) arguments are taken from src0.row, src1.col and
 * src0.col respectively; the current dst->tile is passed as the helper's dst
 * argument, followed by src0.tile and src1.tile. dst->row and dst->col are
 * not consulted.
 *
 * Compiled with the "amx-bf16" attribute bundle (__DEFAULT_FN_ATTRS_BF16).
 * NOTE(review): the body uses `src1`, but its parameter declaration and the
 * opening brace are missing from this text, as is the closing brace; the
 * leading numbers are not valid C. Restore from the upstream header.
 */
480__DEFAULT_FN_ATTRS_BF16
481static __inline__
void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
483 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col,
dst->tile,
484 src0.tile, src1.tile);
/* Remove the three attribute helper macros so they do not remain defined
 * after this point for code that includes this header.
 * NOTE(review): the leading numbers on these lines are not valid C and look
 * like extraction residue.
 */
487#undef __DEFAULT_FN_ATTRS_TILE
488#undef __DEFAULT_FN_ATTRS_INT8
489#undef __DEFAULT_FN_ATTRS_BF16
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
const half4 dst(half4 Src0, half4 Src1)