#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__SSE4_2__) || defined(__SSE4_1__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AES__) || defined(__PCLMUL__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512BW__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512CD__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512DQ__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512FP16__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512BF16__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
/* Returns the value of the IA32_TSC_AUX MSR (0xc0000103). */
static __inline__ unsigned int
    __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
_rdpid_u32(void) {
  return __builtin_ia32_rdpid();
}
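/* Usage sketch (illustrative only, not part of this header): IA32_TSC_AUX is
 * populated by the operating system. On Linux, by convention, the current CPU
 * number is encoded in bits 11:0 (an OS convention, not guaranteed here):
 *
 *   unsigned int __aux = _rdpid_u32();
 *   unsigned int __cpu = __aux & 0xfff;   // Linux convention, assumption
 */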
#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
static __inline__ int
    __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
{
  return (int)__builtin_ia32_rdrand16_step(__p);
}
static __inline__ int
    __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p)
{
  return (int)__builtin_ia32_rdrand32_step(__p);
}
#ifdef __x86_64__
static __inline__ int
    __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
  return (int)__builtin_ia32_rdrand64_step(__p);
}
#else
/* The 64-bit RDRAND form is unavailable on 32-bit targets, so emulate it
   with two 32-bit RDRAND steps. */
static __inline__ int
    __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
  unsigned int __lo, __hi;
  unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
  unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
  if (__res_lo && __res_hi) {
    *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
    return 1;
  } else {
    *__p = 0;
    return 0;
  }
}
#endif
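/* Usage sketch (illustrative only, not part of this header): RDRAND can
 * transiently report failure, so callers normally retry a bounded number of
 * times. The helper name below is hypothetical.
 *
 *   static int __get_random_u64(unsigned long long *__out) {
 *     for (int __i = 0; __i < 10; ++__i)
 *       if (_rdrand64_step(__out))   // nonzero return means success
 *         return 1;
 *     return 0;                      // hardware kept failing
 *   }
 */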
#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
/* Reads the lower 32 bits of the FS base register. */
static __inline__ unsigned int
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u32(void)
{
  return __builtin_ia32_rdfsbase32();
}
/* Reads the full 64-bit FS base register. */
static __inline__ unsigned long long
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u64(void)
{
  return __builtin_ia32_rdfsbase64();
}
/* Reads the lower 32 bits of the GS base register. */
static __inline__ unsigned int
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u32(void)
{
  return __builtin_ia32_rdgsbase32();
}
/* Reads the full 64-bit GS base register. */
static __inline__ unsigned long long
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u64(void)
{
  return __builtin_ia32_rdgsbase64();
}
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u32(unsigned int __V)
{
  __builtin_ia32_wrfsbase32(__V);
}
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u64(unsigned long long __V)
{
  __builtin_ia32_wrfsbase64(__V);
}
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u32(unsigned int __V)
{
  __builtin_ia32_wrgsbase32(__V);
}
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u64(unsigned long long __V)
{
  __builtin_ia32_wrgsbase64(__V);
}
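/* Usage sketch (illustrative only, not part of this header): user-space
 * FSGSBASE requires the OS to have enabled CR4.FSGSBASE, so callers normally
 * gate on an OS/CPUID check first. The helper below (hypothetical name) saves
 * and restores the GS base around some work:
 *
 *   static void __with_temp_gsbase(unsigned long long __tmp) {
 *     unsigned long long __saved = _readgsbase_u64();
 *     _writegsbase_u64(__tmp);
 *     // ... code that relies on the temporary GS base ...
 *     _writegsbase_u64(__saved);
 *   }
 */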
#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)
static __inline__ short
    __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i16(void const * __P) {
  struct __loadu_i16 {
    unsigned short __v;
  } __attribute__((__packed__, __may_alias__));
  return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
}
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i16(void * __P, short __D) {
  struct __storeu_i16 {
    unsigned short __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
}
static __inline__ int
    __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i32(void const * __P) {
  struct __loadu_i32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
}
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i32(void * __P, int __D) {
  struct __storeu_i32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_i32*)__P)->__v = __builtin_bswap32((unsigned int)__D);
}
static __inline__ long long
    __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i64(void const * __P) {
  struct __loadu_i64 {
    unsigned long long __v;
  } __attribute__((__packed__, __may_alias__));
  return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
}
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i64(void * __P, long long __D) {
  struct __storeu_i64 {
    unsigned long long __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
}
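/* Usage sketch (illustrative only, not part of this header): the _loadbe_*
 * and _storebe_* intrinsics read and write unaligned big-endian values, which
 * is convenient for network or file formats. The field layout below is
 * hypothetical.
 *
 *   static unsigned int __read_record_len(const unsigned char *__buf) {
 *     // the first four bytes hold a big-endian length field
 *     return (unsigned int)_loadbe_i32(__buf);
 *   }
 */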
#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) || \
    defined(__MOVDIR64B__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVRS__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX10_2__) && defined(__MOVRS__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX10_2_512__) && defined(__MOVRS__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) || \
    defined(__WIDEKL__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) || \
    defined(__AMX_INT8__) || defined(__AMX_BF16__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AMX_BF16__) && defined(__AMX_TRANSPOSE__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AMX_FP16__) && defined(__AMX_TRANSPOSE__))
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AMX_COMPLEX__) && defined(__AMX_TRANSPOSE__))
#if !defined(__SCE__) || __has_feature(modules) || \
    defined(__AVX512VP2INTERSECT__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX10_2_512__) && defined(__SM4__))
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
#if defined(_MSC_VER) && __has_extension(gnu_asm)
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
  /* The 0xf2 (XACQUIRE) prefix on a locked xchg requests lock elision. */
  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
  /* The 0xf3 (XRELEASE) prefix marks the end of the elided critical section. */
  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
#endif
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
                                       long _Exchange, long _Comparand) {
  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
                                       long _Exchange, long _Comparand) {
  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
                                         __int64 _Exchange, __int64 _Comparand) {
  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
                                         __int64 _Exchange, __int64 _Comparand) {
  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */