#ifndef NO_WARN_X86_INTRINSICS
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
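/* Illustrative sketch, not part of the original header: _MM_SHUFFLE packs
   four 2-bit lane selectors into one immediate byte, with the selector for
   the highest result lane in the top two bits (C11 _Static_assert used): */
_Static_assert(_MM_SHUFFLE(3, 2, 1, 0) == 0xE4, "identity selector");
_Static_assert(_MM_SHUFFLE(0, 1, 2, 3) == 0x1B, "lane-reversal selector");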
#if defined(__STRICT_ANSI__) &&                                                \
    (defined(__cplusplus) ||                                                   \
     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
typedef vector float __v4sf;
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_ld(0, (__v4sf *)__P));
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (vec_vsx_ld(0, __P));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned char __permute_vector = {
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
  __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ps(const float __Z, const float __Y, const float __X,
               const float __W) {
  return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
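/* Usage sketch (illustrative, not part of the original header; the
   __example_* helper is hypothetical): _mm_set_ps lists arguments from
   element 3 down to element 0, _mm_setr_ps from element 0 up, so both
   calls below build the same vector {1, 2, 3, 4}: */
static __inline float __example_set_order(void) {
  __m128 __hi_first = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128 __lo_first = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  return __hi_first[0] - __lo_first[0]; /* 0.0f: both have 1.0f in lane 0 */
}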
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  *(__m128_u *)__P = __A;
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned char __permute_vector = {
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
  __tmp = (__m128)vec_perm(__A, __A, __permute_vector);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  *__P = ((__v4sf)__A)[0];
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __A[0] = __A[0] + __B[0];
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __A[0] = __A[0] - __B[0];
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __A[0] = __A[0] * __B[0];
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __A[0] = __A[0] / __B[0];
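/* Usage sketch (illustrative, not part of the original header): the scalar
   *_ss forms operate on element 0 only and pass elements 1-3 through from
   the first operand: */
static __inline float __example_add_ss_lane0(void) {
  __m128 __a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);     /* lanes {1,2,3,4}  */
  __m128 __b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); /* lanes {10,..}    */
  __m128 __r = _mm_add_ss(__a, __b);                   /* lanes {11,2,3,4} */
  return __r[0] + __r[1];                              /* 13.0f */
}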
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)((__v4sf)__A + (__v4sf)__B);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)((__v4sf)__A - (__v4sf)__B);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)((__v4sf)__A * (__v4sf)__B);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)((__v4sf)__A / (__v4sf)__B);
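/* Usage sketch (illustrative, not part of the original header): the packed
   forms apply the operation to all four lanes at once, here a*a + a: */
static __inline __m128 __example_packed_poly(__m128 __a) {
  return _mm_add_ps(_mm_mul_ps(__a, __a), __a);
}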
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (vec_sqrt((__v4sf)__A));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (vec_re((__v4sf)__A));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
  return ((__m128)vec_nor(__temp, __temp));
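/* Usage sketch (illustrative, not part of the original header): packed
   compares yield an all-ones mask per true lane, which composes with the
   bitwise ops above into a branchless per-lane select, max(a, b) here: */
static __inline __m128 __example_select_max(__m128 __a, __m128 __b) {
  __m128 __gt = _mm_cmpgt_ps(__a, __b);       /* all-ones where a > b */
  return _mm_or_ps(_mm_and_ps(__gt, __a),     /* lanes taken from a   */
                   _mm_andnot_ps(__gt, __b)); /* lanes taken from b   */
}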
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] == __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] < __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] <= __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] > __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] >= __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] != __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] == __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] < __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] <= __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] > __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] >= __B[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__A[0] != __B[0]);
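/* Usage sketch (illustrative, not part of the original header; assumes the
   scalar compares above are the comi and ucomi family): they compare
   element 0 and return an int, usable directly in scalar control flow,
   e.g. clamping a negative lane 0 to zero: */
static __inline float __example_clamp0(__m128 __a) {
  return _mm_comilt_ss(__a, _mm_setzero_ps()) ? 0.0f : __a[0];
}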
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return ((__v4sf)__A)[0];
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
  __res = __builtin_rint(__A[0]);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si64(__m128 __A) {
#if defined(_ARCH_PWR8) && defined(__powerpc64__)
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
  __res = __builtin_llrint(__A[0]);
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si64x(__m128 __A) {
  return _mm_cvtss_si64((__v4sf)__A);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __builtin_prefetch(__P);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __v4sf __temp, __rounded;
  __vector unsigned long long __result;
  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
  __rounded = vec_rint(__temp);
  __result = (__vector unsigned long long)vec_cts(__rounded, 0);
  return (__m64)((__vector long long)__result)[0];
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  float __temp = __A[0];
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64(__m128 __A) {
  float __temp = __A[0];
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64x(__m128 __A) {
  float __temp = __A[0];
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned long long __result;
  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
  __result = (__vector unsigned long long)vec_cts(__temp, 0);
  return (__m64)((__vector long long)__result)[0];
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_ss(__m128 __A, long long __B) {
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_ss(__m128 __A, long long __B) {
  return _mm_cvtsi64_ss(__A, __B);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed int __vm1;
  __vector float __vf1;
  __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
  return ((__m128)(__vector unsigned long long){
      ((__vector unsigned long long)__vf1)[0],
      ((__vector unsigned long long)__A)[1]});
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;
  __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
  return (__m128)__vf1;
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;
  __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
#ifdef __LITTLE_ENDIAN__
  return (__m128)__vf1;
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed char __vc16;
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;
  __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
  return (__m128)__vf1;
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
  __vector unsigned char __vc16;
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;
  __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
#ifdef __LITTLE_ENDIAN__
  __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
  __vi4 =
      (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
#else
  __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
  __vi4 = (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero,
                                            __vs8);
#endif
  return (__m128)__vf1;
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed int __vi4;
  __vector float __vf4;
  __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
  return (__m128)__vf4;
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed int __temp;
  __vector unsigned long long __result;
  __rounded = vec_rint(__A);
  __temp = vec_cts(__rounded, 0);
  __result = (__vector unsigned long long)vec_pack(__temp, __temp);
  return (__m64)((__vector long long)__result)[0];
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed int __tmp_i;
  static const __vector signed int __zero = {0, 0, 0, 0};
  __vector signed short __tmp_s;
  __vector signed char __res_v;
  __rounded = vec_rint(__A);
  __tmp_i = vec_cts(__rounded, 0);
  __tmp_s = vec_pack(__tmp_i, __zero);
  __res_v = vec_pack(__tmp_s, __tmp_s);
  return (__m64)((__vector long long)__res_v)[0];
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __vector unsigned int __t;
  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
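/* Usage sketch (illustrative, not part of the original header): the two low
   selectors of the _MM_SHUFFLE immediate index into __A, the two high
   selectors into __B: */
static __inline __m128 __example_shuffle(__m128 __a, __m128 __b) {
  /* result lanes: {__a[0], __a[0], __b[3], __b[3]} */
  return _mm_shuffle_ps(__a, __b, _MM_SHUFFLE(3, 3, 0, 0));
}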
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned long long __a = (__vector unsigned long long)__A;
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned long long __a = (__vector unsigned long long)__A;
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)vec_mergel((__vector unsigned long long)__B,
                            (__vector unsigned long long)__A);
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m128)vec_mergeh((__vector unsigned long long)__A,
                            (__vector unsigned long long)__B);
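/* Usage sketch (illustrative, not part of the original header; assumes the
   two doubleword merges above implement _mm_movehl_ps and _mm_movelh_ps):
   a common idiom is a horizontal sum by folding the high half onto the low
   half: */
static __inline float __example_hsum(__m128 __v) {
  __m128 __hi = _mm_movehl_ps(__v, __v); /* {v[2], v[3], v[2], v[3]} */
  __m128 __s = _mm_add_ps(__v, __hi);    /* {v0+v2, v1+v3, ...}      */
  return __s[0] + __s[1];                /* v0 + v1 + v2 + v3        */
}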
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned long long __a = (__vector unsigned long long)__A;
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned long long __a = (__vector unsigned long long)__A;
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return vec_extractm((__vector unsigned int)__A);
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
  };
  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
#ifdef __LITTLE_ENDIAN__
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  unsigned int __shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
  __shiftr = 3 - __shiftr;
#endif
  return ((__A >> (__shiftr * 16)) & 0xffff);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pextrw(__m64 const __A, int const __N) {
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  const int __shiftl = (__N & 3) * 16;
  const __m64 __shiftD = (const __m64)__D << __shiftl;
  const __m64 __mask = 0xffffUL << __shiftl;
  __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
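/* Usage sketch (illustrative, not part of the original header):
   round-tripping a 16-bit field through extract and insert reproduces the
   original vector, independent of endianness: */
static __inline __m64 __example_field_roundtrip(__m64 __v) {
  int __field = _mm_extract_pi16(__v, 2);  /* read 16-bit element 2    */
  return _mm_insert_pi16(__v, __field, 2); /* write it back: __v again */
}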
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pinsrw(__m64 const __A, int const __D, int const __N) {
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed short __a, __b, __r;
  __vector __bool short __c;
  return (__m64)((__vector long long)__r)[0];
  __m64_union __m1, __m2, __res;
  __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
                                                            : __m2.as_short[0];
  __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
                                                            : __m2.as_short[1];
  __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
                                                            : __m2.as_short[2];
  __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
                                                            : __m2.as_short[3];
  return (__m64)__res.as_m64;
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;
  return (__m64)((__vector long long)__r)[0];
  __m64_union __m1, __m2, __res;
  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
        ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
            ? __m1.as_char[__i]
            : __m2.as_char[__i];
  return (__m64)__res.as_m64;
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector signed short __a, __b, __r;
  __vector __bool short __c;
  return (__m64)((__vector long long)__r)[0];
  __m64_union __m1, __m2, __res;
  __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
                                                            : __m2.as_short[0];
  __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
                                                            : __m2.as_short[1];
  __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
                                                            : __m2.as_short[2];
  __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
                                                            : __m2.as_short[3];
  return (__m64)__res.as_m64;
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;
  return (__m64)((__vector long long)__r)[0];
  __m64_union __m1, __m2, __res;
  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
        ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
            ? __m1.as_char[__i]
            : __m2.as_char[__i];
  return (__m64)__res.as_m64;
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  unsigned long long __p =
#ifdef __LITTLE_ENDIAN__
      0x0008101820283038UL;
#else
      0x3830282018100800UL;
#endif
  return __builtin_bpermd(__p, __A);
#ifdef __LITTLE_ENDIAN__
  unsigned int __mask = 0x20283038UL;
  unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
  unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
#else
  unsigned int __mask = 0x38302820UL;
  unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
  unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
#endif
  return (__r2 << 4) | __r1;
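/* Usage sketch (illustrative, not part of the original header): each set
   sign bit among the eight bytes becomes one bit of the result, so bytes
   of 0x80 everywhere produce the full 8-bit mask: */
static __inline int __example_movemask(void) {
  return _mm_movemask_pi8((__m64)0x8080808080808080UL); /* 0xff */
}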
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned short __a, __b;
  __vector unsigned short __c;
  __vector unsigned int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };
  __w0 = vec_vmuleuh(__a, __b);
  __w1 = vec_vmulouh(__a, __b);
  __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);
  return (__m64)((__vector long long)__c)[0];
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  unsigned long __element_selector_10 = __N & 0x03;
  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
  };
  __m64_union __t;
  __vector unsigned long long __a, __p, __r;
#ifdef __LITTLE_ENDIAN__
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
#else
  __t.as_short[3] = __permute_selectors[__element_selector_10];
  __t.as_short[2] = __permute_selectors[__element_selector_32];
  __t.as_short[1] = __permute_selectors[__element_selector_54];
  __t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
  return (__m64)((__vector long long)__r)[0];
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __m64 __hibit = 0x8080808080808080UL;
  __m64 __mask, __tmp;
  __m64 *__p = (__m64 *)__P;
  __tmp = (__tmp & (~__mask)) | (__A & __mask);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m64)((__vector long long)__c)[0];
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  return (__m64)((__vector long long)__c)[0];
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __vector unsigned char __a, __b;
  __vector unsigned char __vmin, __vmax, __vabsdiff;
  __vector signed int __vsum;
  const __vector unsigned int __zero = {0, 0, 0, 0};
  __m64_union __result = {0};
  __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
  __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
  __vabsdiff = vec_sub(__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
  /* Sum across four integers with integer result.  */
  __vsum = vec_sums(__vsum, (__vector signed int)__zero);
  __result.as_short[0] = __vsum[3];
  return __result.as_m64;
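/* Usage sketch (illustrative, not part of the original header;
   little-endian result layout assumed): the sum of absolute byte
   differences lands in the low 16 bits of the result: */
static __inline int __example_sad(void) {
  __m64 __x = 0x0000000000000A05UL; /* low bytes {5, 10} */
  __m64 __y = 0x0000000000000307UL; /* low bytes {7, 3}  */
  return (int)(_mm_sad_pu8(__x, __y) & 0xffff); /* |5-7| + |10-3| == 9 */
}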
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __asm__("  dcbtstt  0,%0" : : "b"(__P) : "memory");
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __asm__("  dcbtstt  0,%0" : : "b"(__P) : "memory");
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  __atomic_thread_fence(__ATOMIC_RELEASE);
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  unsigned long __PPR;
  __asm__ volatile("  mfppr  %0;"
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
  do {                                                                         \
    __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);         \
    __v4sf __t0 = vec_vmrghw(__r0, __r1);                                      \
    __v4sf __t1 = vec_vmrghw(__r2, __r3);                                      \
    __v4sf __t2 = vec_vmrglw(__r0, __r1);                                      \
    __v4sf __t3 = vec_vmrglw(__r2, __r3);                                      \
    (row0) = (__v4sf)vec_mergeh((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row1) = (__v4sf)vec_mergel((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row2) = (__v4sf)vec_mergeh((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
    (row3) = (__v4sf)vec_mergel((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
  } while (0)
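/* Usage sketch (illustrative, not part of the original header): transposing
   a 4x4 matrix held in four row vectors, in place: */
static __inline void __example_transpose(__m128 *__m) {
  _MM_TRANSPOSE4_PS(__m[0], __m[1], __m[2], __m[3]);
  /* __m[i][j] now holds the former value of __m[j][i]. */
}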
#include_next <xmmintrin.h>