clang API Documentation

xmmintrin.h
Go to the documentation of this file.
00001 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
00002  *
00003  * Permission is hereby granted, free of charge, to any person obtaining a copy
00004  * of this software and associated documentation files (the "Software"), to deal
00005  * in the Software without restriction, including without limitation the rights
00006  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00007  * copies of the Software, and to permit persons to whom the Software is
00008  * furnished to do so, subject to the following conditions:
00009  *
00010  * The above copyright notice and this permission notice shall be included in
00011  * all copies or substantial portions of the Software.
00012  *
00013  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00014  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00015  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00016  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00017  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00018  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00019  * THE SOFTWARE.
00020  *
00021  *===-----------------------------------------------------------------------===
00022  */
00023  
00024 #ifndef __XMMINTRIN_H
00025 #define __XMMINTRIN_H
00026  
00027 #ifndef __SSE__
00028 #error "SSE instruction set not enabled"
00029 #else
00030 
00031 #include <mmintrin.h>
00032 
/* 16-byte vector types built with the __vector_size__ extension. */
typedef float __m128 __attribute__((__vector_size__(16))); /* type used by the intrinsics' signatures */
typedef float __v4sf __attribute__((__vector_size__(16))); /* float lanes, used in casts */
typedef int __v4si __attribute__((__vector_size__(16)));   /* int lanes, used for bitwise operations */
00036 
00037 // This header should only be included in a hosted environment as it depends on
00038 // a standard library to provide allocation routines.
00039 #if __STDC_HOSTED__
00040 #include <mm_malloc.h>
00041 #endif
00042 
00043 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00044 _mm_add_ss(__m128 a, __m128 b)
00045 {
00046   a[0] += b[0];
00047   return a;
00048 }
00049 
00050 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00051 _mm_add_ps(__m128 a, __m128 b)
00052 {
00053   return a + b;
00054 }
00055 
00056 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00057 _mm_sub_ss(__m128 a, __m128 b)
00058 {
00059   a[0] -= b[0];
00060   return a;
00061 }
00062 
00063 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00064 _mm_sub_ps(__m128 a, __m128 b)
00065 {
00066   return a - b;
00067 }
00068 
00069 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00070 _mm_mul_ss(__m128 a, __m128 b)
00071 {
00072   a[0] *= b[0];
00073   return a;
00074 }
00075 
00076 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00077 _mm_mul_ps(__m128 a, __m128 b)
00078 {
00079   return a * b;
00080 }
00081 
00082 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00083 _mm_div_ss(__m128 a, __m128 b)
00084 {
00085   a[0] /= b[0];
00086   return a;
00087 }
00088 
00089 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00090 _mm_div_ps(__m128 a, __m128 b)
00091 {
00092   return a / b;
00093 }
00094 
00095 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00096 _mm_sqrt_ss(__m128 a)
00097 {
00098   return __builtin_ia32_sqrtss(a);
00099 }
00100 
00101 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00102 _mm_sqrt_ps(__m128 a)
00103 {
00104   return __builtin_ia32_sqrtps(a);
00105 }
00106 
00107 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00108 _mm_rcp_ss(__m128 a)
00109 {
00110   return __builtin_ia32_rcpss(a);
00111 }
00112 
00113 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00114 _mm_rcp_ps(__m128 a)
00115 {
00116   return __builtin_ia32_rcpps(a);
00117 }
00118 
00119 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00120 _mm_rsqrt_ss(__m128 a)
00121 {
00122   return __builtin_ia32_rsqrtss(a);
00123 }
00124 
00125 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00126 _mm_rsqrt_ps(__m128 a)
00127 {
00128   return __builtin_ia32_rsqrtps(a);
00129 }
00130 
00131 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00132 _mm_min_ss(__m128 a, __m128 b)
00133 {
00134   return __builtin_ia32_minss(a, b);
00135 }
00136 
00137 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00138 _mm_min_ps(__m128 a, __m128 b)
00139 {
00140   return __builtin_ia32_minps(a, b);
00141 }
00142 
00143 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00144 _mm_max_ss(__m128 a, __m128 b)
00145 {
00146   return __builtin_ia32_maxss(a, b);
00147 }
00148 
00149 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00150 _mm_max_ps(__m128 a, __m128 b)
00151 {
00152   return __builtin_ia32_maxps(a, b);
00153 }
00154 
00155 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00156 _mm_and_ps(__m128 a, __m128 b)
00157 {
00158   return (__m128)((__v4si)a & (__v4si)b);
00159 }
00160 
00161 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00162 _mm_andnot_ps(__m128 a, __m128 b)
00163 {
00164   return (__m128)(~(__v4si)a & (__v4si)b);
00165 }
00166 
00167 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00168 _mm_or_ps(__m128 a, __m128 b)
00169 {
00170   return (__m128)((__v4si)a | (__v4si)b);
00171 }
00172 
00173 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00174 _mm_xor_ps(__m128 a, __m128 b)
00175 {
00176   return (__m128)((__v4si)a ^ (__v4si)b);
00177 }
00178 
00179 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00180 _mm_cmpeq_ss(__m128 a, __m128 b)
00181 {
00182   return (__m128)__builtin_ia32_cmpss(a, b, 0);
00183 }
00184 
00185 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00186 _mm_cmpeq_ps(__m128 a, __m128 b)
00187 {
00188   return (__m128)__builtin_ia32_cmpps(a, b, 0);
00189 }
00190 
00191 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00192 _mm_cmplt_ss(__m128 a, __m128 b)
00193 {
00194   return (__m128)__builtin_ia32_cmpss(a, b, 1);
00195 }
00196 
00197 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00198 _mm_cmplt_ps(__m128 a, __m128 b)
00199 {
00200   return (__m128)__builtin_ia32_cmpps(a, b, 1);
00201 }
00202 
00203 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00204 _mm_cmple_ss(__m128 a, __m128 b)
00205 {
00206   return (__m128)__builtin_ia32_cmpss(a, b, 2);
00207 }
00208 
00209 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00210 _mm_cmple_ps(__m128 a, __m128 b)
00211 {
00212   return (__m128)__builtin_ia32_cmpps(a, b, 2);
00213 }
00214 
00215 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00216 _mm_cmpgt_ss(__m128 a, __m128 b)
00217 {
00218   return (__m128)__builtin_ia32_cmpss(b, a, 1);
00219 }
00220 
00221 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00222 _mm_cmpgt_ps(__m128 a, __m128 b)
00223 {
00224   return (__m128)__builtin_ia32_cmpps(b, a, 1);
00225 }
00226 
00227 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00228 _mm_cmpge_ss(__m128 a, __m128 b)
00229 {
00230   return (__m128)__builtin_ia32_cmpss(b, a, 2);
00231 }
00232 
00233 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00234 _mm_cmpge_ps(__m128 a, __m128 b)
00235 {
00236   return (__m128)__builtin_ia32_cmpps(b, a, 2);
00237 }
00238 
00239 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00240 _mm_cmpneq_ss(__m128 a, __m128 b)
00241 {
00242   return (__m128)__builtin_ia32_cmpss(a, b, 4);
00243 }
00244 
00245 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00246 _mm_cmpneq_ps(__m128 a, __m128 b)
00247 {
00248   return (__m128)__builtin_ia32_cmpps(a, b, 4);
00249 }
00250 
00251 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00252 _mm_cmpnlt_ss(__m128 a, __m128 b)
00253 {
00254   return (__m128)__builtin_ia32_cmpss(a, b, 5);
00255 }
00256 
00257 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00258 _mm_cmpnlt_ps(__m128 a, __m128 b)
00259 {
00260   return (__m128)__builtin_ia32_cmpps(a, b, 5);
00261 }
00262 
00263 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00264 _mm_cmpnle_ss(__m128 a, __m128 b)
00265 {
00266   return (__m128)__builtin_ia32_cmpss(a, b, 6);
00267 }
00268 
00269 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00270 _mm_cmpnle_ps(__m128 a, __m128 b)
00271 {
00272   return (__m128)__builtin_ia32_cmpps(a, b, 6);
00273 }
00274 
00275 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00276 _mm_cmpngt_ss(__m128 a, __m128 b)
00277 {
00278   return (__m128)__builtin_ia32_cmpss(b, a, 5);
00279 }
00280 
00281 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00282 _mm_cmpngt_ps(__m128 a, __m128 b)
00283 {
00284   return (__m128)__builtin_ia32_cmpps(b, a, 5);
00285 }
00286 
00287 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00288 _mm_cmpnge_ss(__m128 a, __m128 b)
00289 {
00290   return (__m128)__builtin_ia32_cmpss(b, a, 6);
00291 }
00292 
00293 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00294 _mm_cmpnge_ps(__m128 a, __m128 b)
00295 {
00296   return (__m128)__builtin_ia32_cmpps(b, a, 6);
00297 }
00298 
00299 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00300 _mm_cmpord_ss(__m128 a, __m128 b)
00301 {
00302   return (__m128)__builtin_ia32_cmpss(a, b, 7);
00303 }
00304 
00305 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00306 _mm_cmpord_ps(__m128 a, __m128 b)
00307 {
00308   return (__m128)__builtin_ia32_cmpps(a, b, 7);
00309 }
00310 
00311 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00312 _mm_cmpunord_ss(__m128 a, __m128 b)
00313 {
00314   return (__m128)__builtin_ia32_cmpss(a, b, 3);
00315 }
00316 
00317 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00318 _mm_cmpunord_ps(__m128 a, __m128 b)
00319 {
00320   return (__m128)__builtin_ia32_cmpps(a, b, 3);
00321 }
00322 
00323 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00324 _mm_comieq_ss(__m128 a, __m128 b)
00325 {
00326   return __builtin_ia32_comieq(a, b);
00327 }
00328 
00329 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00330 _mm_comilt_ss(__m128 a, __m128 b)
00331 {
00332   return __builtin_ia32_comilt(a, b);
00333 }
00334 
00335 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00336 _mm_comile_ss(__m128 a, __m128 b)
00337 {
00338   return __builtin_ia32_comile(a, b);
00339 }
00340 
00341 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00342 _mm_comigt_ss(__m128 a, __m128 b)
00343 {
00344   return __builtin_ia32_comigt(a, b);
00345 }
00346 
00347 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00348 _mm_comige_ss(__m128 a, __m128 b)
00349 {
00350   return __builtin_ia32_comige(a, b);
00351 }
00352 
00353 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00354 _mm_comineq_ss(__m128 a, __m128 b)
00355 {
00356   return __builtin_ia32_comineq(a, b);
00357 }
00358 
00359 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00360 _mm_ucomieq_ss(__m128 a, __m128 b)
00361 {
00362   return __builtin_ia32_ucomieq(a, b);
00363 }
00364 
00365 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00366 _mm_ucomilt_ss(__m128 a, __m128 b)
00367 {
00368   return __builtin_ia32_ucomilt(a, b);
00369 }
00370 
00371 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00372 _mm_ucomile_ss(__m128 a, __m128 b)
00373 {
00374   return __builtin_ia32_ucomile(a, b);
00375 }
00376 
00377 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00378 _mm_ucomigt_ss(__m128 a, __m128 b)
00379 {
00380   return __builtin_ia32_ucomigt(a, b);
00381 }
00382 
00383 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00384 _mm_ucomige_ss(__m128 a, __m128 b)
00385 {
00386   return __builtin_ia32_ucomige(a, b);
00387 }
00388 
00389 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00390 _mm_ucomineq_ss(__m128 a, __m128 b)
00391 {
00392   return __builtin_ia32_ucomineq(a, b);
00393 }
00394 
00395 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00396 _mm_cvtss_si32(__m128 a)
00397 {
00398   return __builtin_ia32_cvtss2si(a);
00399 }
00400 
00401 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00402 _mm_cvt_ss2si(__m128 a)
00403 {
00404   return _mm_cvtss_si32(a);
00405 }
00406 
00407 #ifdef __x86_64__
00408 
00409 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
00410 _mm_cvtss_si64(__m128 a)
00411 {
00412   return __builtin_ia32_cvtss2si64(a);
00413 }
00414 
00415 #endif
00416 
00417 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00418 _mm_cvtps_pi32(__m128 a)
00419 {
00420   return (__m64)__builtin_ia32_cvtps2pi(a);
00421 }
00422 
00423 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00424 _mm_cvt_ps2pi(__m128 a)
00425 {
00426   return _mm_cvtps_pi32(a);
00427 }
00428 
00429 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00430 _mm_cvttss_si32(__m128 a)
00431 {
00432   return a[0];
00433 }
00434 
00435 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00436 _mm_cvtt_ss2si(__m128 a)
00437 {
00438   return _mm_cvttss_si32(a);
00439 }
00440 
00441 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
00442 _mm_cvttss_si64(__m128 a)
00443 {
00444   return a[0];
00445 }
00446 
00447 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00448 _mm_cvttps_pi32(__m128 a)
00449 {
00450   return (__m64)__builtin_ia32_cvttps2pi(a);
00451 }
00452 
00453 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00454 _mm_cvtt_ps2pi(__m128 a)
00455 {
00456   return _mm_cvttps_pi32(a);
00457 }
00458 
00459 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00460 _mm_cvtsi32_ss(__m128 a, int b)
00461 {
00462   a[0] = b;
00463   return a;
00464 }
00465 
00466 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00467 _mm_cvt_si2ss(__m128 a, int b)
00468 {
00469   return _mm_cvtsi32_ss(a, b);
00470 }
00471 
00472 #ifdef __x86_64__
00473 
00474 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00475 _mm_cvtsi64_ss(__m128 a, long long b)
00476 {
00477   a[0] = b;
00478   return a;
00479 }
00480 
00481 #endif
00482 
00483 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00484 _mm_cvtpi32_ps(__m128 a, __m64 b)
00485 {
00486   return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
00487 }
00488 
00489 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00490 _mm_cvt_pi2ps(__m128 a, __m64 b)
00491 {
00492   return _mm_cvtpi32_ps(a, b);
00493 }
00494 
00495 static __inline__ float __attribute__((__always_inline__, __nodebug__))
00496 _mm_cvtss_f32(__m128 a)
00497 {
00498   return a[0];
00499 }
00500 
00501 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00502 _mm_loadh_pi(__m128 a, const __m64 *p)
00503 {
00504   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
00505   struct __mm_loadh_pi_struct {
00506     __mm_loadh_pi_v2f32 u;
00507   } __attribute__((__packed__, __may_alias__));
00508   __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u;
00509   __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
00510   return __builtin_shufflevector(a, bb, 0, 1, 4, 5);
00511 }
00512 
00513 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00514 _mm_loadl_pi(__m128 a, const __m64 *p)
00515 {
00516   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
00517   struct __mm_loadl_pi_struct {
00518     __mm_loadl_pi_v2f32 u;
00519   } __attribute__((__packed__, __may_alias__));
00520   __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u;
00521   __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1);
00522   return __builtin_shufflevector(a, bb, 4, 5, 2, 3);
00523 }
00524 
00525 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00526 _mm_load_ss(const float *p)
00527 {
00528   struct __mm_load_ss_struct {
00529     float u;
00530   } __attribute__((__packed__, __may_alias__));
00531   float u = ((struct __mm_load_ss_struct*)p)->u;
00532   return (__m128){ u, 0, 0, 0 };
00533 }
00534 
00535 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00536 _mm_load1_ps(const float *p)
00537 {
00538   struct __mm_load1_ps_struct {
00539     float u;
00540   } __attribute__((__packed__, __may_alias__));
00541   float u = ((struct __mm_load1_ps_struct*)p)->u;
00542   return (__m128){ u, u, u, u };
00543 }
00544 
00545 #define        _mm_load_ps1(p) _mm_load1_ps(p)
00546 
00547 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00548 _mm_load_ps(const float *p)
00549 {
00550   return *(__m128*)p;
00551 }
00552 
00553 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00554 _mm_loadu_ps(const float *p)
00555 {
00556   struct __loadu_ps {
00557     __m128 v;
00558   } __attribute__((__packed__, __may_alias__));
00559   return ((struct __loadu_ps*)p)->v;
00560 }
00561 
00562 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00563 _mm_loadr_ps(const float *p)
00564 {
00565   __m128 a = _mm_load_ps(p);
00566   return __builtin_shufflevector(a, a, 3, 2, 1, 0);
00567 }
00568 
00569 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00570 _mm_set_ss(float w)
00571 {
00572   return (__m128){ w, 0, 0, 0 };
00573 }
00574 
00575 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00576 _mm_set1_ps(float w)
00577 {
00578   return (__m128){ w, w, w, w };
00579 }
00580 
00581 // Microsoft specific.
00582 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00583 _mm_set_ps1(float w)
00584 {
00585     return _mm_set1_ps(w);
00586 }
00587 
00588 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00589 _mm_set_ps(float z, float y, float x, float w)
00590 {
00591   return (__m128){ w, x, y, z };
00592 }
00593 
00594 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00595 _mm_setr_ps(float z, float y, float x, float w)
00596 {
00597   return (__m128){ z, y, x, w };
00598 }
00599 
00600 static __inline__ __m128 __attribute__((__always_inline__))
00601 _mm_setzero_ps(void)
00602 {
00603   return (__m128){ 0, 0, 0, 0 };
00604 }
00605 
00606 static __inline__ void __attribute__((__always_inline__))
00607 _mm_storeh_pi(__m64 *p, __m128 a)
00608 {
00609   __builtin_ia32_storehps((__v2si *)p, a);
00610 }
00611 
00612 static __inline__ void __attribute__((__always_inline__))
00613 _mm_storel_pi(__m64 *p, __m128 a)
00614 {
00615   __builtin_ia32_storelps((__v2si *)p, a);
00616 }
00617 
00618 static __inline__ void __attribute__((__always_inline__))
00619 _mm_store_ss(float *p, __m128 a)
00620 {
00621   struct __mm_store_ss_struct {
00622     float u;
00623   } __attribute__((__packed__, __may_alias__));
00624   ((struct __mm_store_ss_struct*)p)->u = a[0];
00625 }
00626 
00627 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00628 _mm_storeu_ps(float *p, __m128 a)
00629 {
00630   __builtin_ia32_storeups(p, a);
00631 }
00632 
00633 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00634 _mm_store1_ps(float *p, __m128 a)
00635 {
00636   a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
00637   _mm_storeu_ps(p, a);
00638 }
00639 
00640 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00641 _mm_store_ps1(float *p, __m128 a)
00642 {
00643     return _mm_store1_ps(p, a);
00644 }
00645 
00646 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00647 _mm_store_ps(float *p, __m128 a)
00648 {
00649   *(__m128 *)p = a;
00650 }
00651 
00652 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00653 _mm_storer_ps(float *p, __m128 a)
00654 {
00655   a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
00656   _mm_store_ps(p, a);
00657 }
00658 
/* Hint values for the `sel` argument of _mm_prefetch. */
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/*
 * Issues a read prefetch (second argument 0) for address a, passing sel as
 * the third argument of __builtin_prefetch. The address is cast to
 * `const void *` so that passing a pointer to const data does not discard
 * the qualifier.
 */
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), 0, (sel)))
00668 
00669 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00670 _mm_stream_pi(__m64 *p, __m64 a)
00671 {
00672   __builtin_ia32_movntq(p, a);
00673 }
00674 
00675 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00676 _mm_stream_ps(float *p, __m128 a)
00677 {
00678   __builtin_ia32_movntps(p, a);
00679 }
00680 
00681 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00682 _mm_sfence(void)
00683 {
00684   __builtin_ia32_sfence();
00685 }
00686 
00687 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00688 _mm_extract_pi16(__m64 a, int n)
00689 {
00690   __v4hi b = (__v4hi)a;
00691   return (unsigned short)b[n & 3];
00692 }
00693 
00694 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00695 _mm_insert_pi16(__m64 a, int d, int n)
00696 {
00697    __v4hi b = (__v4hi)a;
00698    b[n & 3] = d;
00699    return (__m64)b;
00700 }
00701 
00702 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00703 _mm_max_pi16(__m64 a, __m64 b)
00704 {
00705   return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
00706 }
00707 
00708 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00709 _mm_max_pu8(__m64 a, __m64 b)
00710 {
00711   return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
00712 }
00713 
00714 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00715 _mm_min_pi16(__m64 a, __m64 b)
00716 {
00717   return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
00718 }
00719 
00720 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00721 _mm_min_pu8(__m64 a, __m64 b)
00722 {
00723   return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
00724 }
00725 
00726 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00727 _mm_movemask_pi8(__m64 a)
00728 {
00729   return __builtin_ia32_pmovmskb((__v8qi)a);
00730 }
00731 
00732 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00733 _mm_mulhi_pu16(__m64 a, __m64 b)
00734 {
00735   return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);  
00736 }
00737 
/*
 * Applies __builtin_ia32_pshufw to a (viewed as __v4hi) with selector n.
 * Written as a statement-expression macro; a is evaluated once into a local.
 */
#define _mm_shuffle_pi16(a, n)                        \
  __extension__ ({                                    \
    __m64 __a = (a);                                  \
    (__m64)__builtin_ia32_pshufw((__v4hi)__a, (n));   \
  })
00741 
00742 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00743 _mm_maskmove_si64(__m64 d, __m64 n, char *p)
00744 {
00745   __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
00746 }
00747 
00748 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00749 _mm_avg_pu8(__m64 a, __m64 b)
00750 {
00751   return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
00752 }
00753 
00754 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00755 _mm_avg_pu16(__m64 a, __m64 b)
00756 {
00757   return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
00758 }
00759 
00760 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00761 _mm_sad_pu8(__m64 a, __m64 b)
00762 {
00763   return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
00764 }
00765 
/* Returns the value produced by __builtin_ia32_stmxcsr(). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
  unsigned int csr = __builtin_ia32_stmxcsr();
  return csr;
}

/* Passes i to __builtin_ia32_ldmxcsr(). */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
  return;
}
00777 
/*
 * Builds a four-lane result from a and b under an 8-bit selector:
 *   lane 0 = a[mask bits 1:0]     lane 1 = a[mask bits 3:2]
 *   lane 2 = b[mask bits 5:4]     lane 3 = b[mask bits 7:6]
 * (the "+ 4" moves an index into the second shuffle operand).
 * a and b are each evaluated once; mask is expanded four times and is used
 * as __builtin_shufflevector indices.
 */
#define _mm_shuffle_ps(a, b, mask)                                        \
  __extension__ ({                                                        \
    __m128 __a = (a);                                                     \
    __m128 __b = (b);                                                     \
    (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__b,             \
                                    (mask) & 0x3,                         \
                                    ((mask) & 0xc) >> 2,                  \
                                    (((mask) & 0x30) >> 4) + 4,           \
                                    (((mask) & 0xc0) >> 6) + 4);          \
  })
00785 
00786 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00787 _mm_unpackhi_ps(__m128 a, __m128 b)
00788 {
00789   return __builtin_shufflevector(a, b, 2, 6, 3, 7);
00790 }
00791 
00792 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00793 _mm_unpacklo_ps(__m128 a, __m128 b)
00794 {
00795   return __builtin_shufflevector(a, b, 0, 4, 1, 5);
00796 }
00797 
00798 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00799 _mm_move_ss(__m128 a, __m128 b)
00800 {
00801   return __builtin_shufflevector(a, b, 4, 1, 2, 3);
00802 }
00803 
00804 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00805 _mm_movehl_ps(__m128 a, __m128 b)
00806 {
00807   return __builtin_shufflevector(a, b, 6, 7, 2, 3);
00808 }
00809 
00810 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00811 _mm_movelh_ps(__m128 a, __m128 b)
00812 {
00813   return __builtin_shufflevector(a, b, 0, 1, 4, 5);
00814 }
00815 
00816 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00817 _mm_cvtpi16_ps(__m64 a)
00818 {
00819   __m64 b, c;
00820   __m128 r;
00821 
00822   b = _mm_setzero_si64();
00823   b = _mm_cmpgt_pi16(b, a);
00824   c = _mm_unpackhi_pi16(a, b);  
00825   r = _mm_setzero_ps();
00826   r = _mm_cvtpi32_ps(r, c);
00827   r = _mm_movelh_ps(r, r);
00828   c = _mm_unpacklo_pi16(a, b);  
00829   r = _mm_cvtpi32_ps(r, c);
00830 
00831   return r;
00832 }
00833 
00834 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00835 _mm_cvtpu16_ps(__m64 a)
00836 {
00837   __m64 b, c;
00838   __m128 r;
00839 
00840   b = _mm_setzero_si64();
00841   c = _mm_unpackhi_pi16(a, b);  
00842   r = _mm_setzero_ps();
00843   r = _mm_cvtpi32_ps(r, c);
00844   r = _mm_movelh_ps(r, r);
00845   c = _mm_unpacklo_pi16(a, b);  
00846   r = _mm_cvtpi32_ps(r, c);
00847 
00848   return r;
00849 }
00850 
00851 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00852 _mm_cvtpi8_ps(__m64 a)
00853 {
00854   __m64 b;
00855   
00856   b = _mm_setzero_si64();
00857   b = _mm_cmpgt_pi8(b, a);
00858   b = _mm_unpacklo_pi8(a, b);
00859 
00860   return _mm_cvtpi16_ps(b);
00861 }
00862 
00863 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00864 _mm_cvtpu8_ps(__m64 a)
00865 {
00866   __m64 b;
00867   
00868   b = _mm_setzero_si64();
00869   b = _mm_unpacklo_pi8(a, b);
00870 
00871   return _mm_cvtpi16_ps(b);
00872 }
00873 
00874 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00875 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
00876 {
00877   __m128 c;
00878   
00879   c = _mm_setzero_ps();  
00880   c = _mm_cvtpi32_ps(c, b);
00881   c = _mm_movelh_ps(c, c);
00882 
00883   return _mm_cvtpi32_ps(c, a);
00884 }
00885 
00886 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00887 _mm_cvtps_pi16(__m128 a)
00888 {
00889   __m64 b, c;
00890   
00891   b = _mm_cvtps_pi32(a);
00892   a = _mm_movehl_ps(a, a);
00893   c = _mm_cvtps_pi32(a);
00894   
00895   return _mm_packs_pi16(b, c);
00896 }
00897 
00898 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00899 _mm_cvtps_pi8(__m128 a)
00900 {
00901   __m64 b, c;
00902   
00903   b = _mm_cvtps_pi16(a);
00904   c = _mm_setzero_si64();
00905   
00906   return _mm_packs_pi16(b, c);
00907 }
00908 
00909 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00910 _mm_movemask_ps(__m128 a)
00911 {
00912   return __builtin_ia32_movmskps(a);
00913 }
00914 
/*
 * Packs four 2-bit selectors into one 8-bit shuffle mask:
 * w in bits 1:0, x in bits 3:2, y in bits 5:4, z in bits 7:6.
 */
#define _MM_SHUFFLE(z, y, x, w) \
  (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
00916 
/* Exception-state bits; _MM_EXCEPT_MASK is the OR of all six. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* Exception-mask bits; _MM_MASK_MASK is the OR of all six. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* Rounding-mode field values; _MM_ROUND_MASK covers the whole field. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* Flush-to-zero field: a single bit, either set (ON) or clear (OFF). */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)
00942 
/* Getters: select one field of the value returned by _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)

/*
 * Setters: clear one field of the current _mm_getcsr() value, OR in x, and
 * pass the result to _mm_setcsr(). x is not masked, so bits of x outside the
 * field are written as well.
 */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
00952 
/*
 * Transposes, in place, the 4x4 float matrix whose rows are row0..row3.
 *
 * Every argument must be a modifiable __m128 lvalue. Each argument is
 * expanded more than once, so arguments must not have side effects.
 *
 * The temporaries use reserved-namespace names so they cannot shadow the
 * caller's variables; with plain names such as tmp0..tmp3 a call like
 * _MM_TRANSPOSE4_PS(tmp0, tmp1, tmp2, tmp3) would read the macro's own
 * uninitialized locals instead of the caller's rows.
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __mm_tr_tmp3, __mm_tr_tmp2, __mm_tr_tmp1, __mm_tr_tmp0; \
  __mm_tr_tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __mm_tr_tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __mm_tr_tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __mm_tr_tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__mm_tr_tmp0, __mm_tr_tmp2); \
  (row1) = _mm_movehl_ps(__mm_tr_tmp2, __mm_tr_tmp0); \
  (row2) = _mm_movelh_ps(__mm_tr_tmp1, __mm_tr_tmp3); \
  (row3) = _mm_movehl_ps(__mm_tr_tmp3, __mm_tr_tmp1); \
} while (0)
00965 
/* Aliases for compatibility: alternate _m_* spellings of the intrinsics above. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Defined once; the identical second definition was redundant. */
#define _m_ _mm_
00982 
00983 /* Ugly hack for backwards-compatibility (compatible with gcc) */
00984 #ifdef __SSE2__
00985 #include <emmintrin.h>
00986 #endif
00987 
00988 #endif /* __SSE__ */
00989 
00990 #endif /* __XMMINTRIN_H */