clang API Documentation

emmintrin.h
Go to the documentation of this file.
00001 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
00002  *
00003  * Permission is hereby granted, free of charge, to any person obtaining a copy
00004  * of this software and associated documentation files (the "Software"), to deal
00005  * in the Software without restriction, including without limitation the rights
00006  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00007  * copies of the Software, and to permit persons to whom the Software is
00008  * furnished to do so, subject to the following conditions:
00009  *
00010  * The above copyright notice and this permission notice shall be included in
00011  * all copies or substantial portions of the Software.
00012  *
00013  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00014  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00015  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00016  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00017  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00018  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00019  * THE SOFTWARE.
00020  *
00021  *===-----------------------------------------------------------------------===
00022  */
00023 
00024 #ifndef __EMMINTRIN_H
00025 #define __EMMINTRIN_H
00026 
00027 #ifndef __SSE2__
00028 #error "SSE2 instruction set not enabled"
00029 #else
00030 
00031 #include <xmmintrin.h>
00032 
/* Public 128-bit vector types: two doubles, and two long longs. */
typedef double    __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Element-typed 16-byte views used for casts inside this header. */
typedef double    __v2df  __attribute__((__vector_size__(16)));
typedef long long __v2di  __attribute__((__vector_size__(16)));
typedef short     __v8hi  __attribute__((__vector_size__(16)));
typedef char      __v16qi __attribute__((__vector_size__(16)));
00041 
00042 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00043 _mm_add_sd(__m128d a, __m128d b)
00044 {
00045   a[0] += b[0];
00046   return a;
00047 }
00048 
00049 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00050 _mm_add_pd(__m128d a, __m128d b)
00051 {
00052   return a + b;
00053 }
00054 
00055 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00056 _mm_sub_sd(__m128d a, __m128d b)
00057 {
00058   a[0] -= b[0];
00059   return a;
00060 }
00061 
00062 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00063 _mm_sub_pd(__m128d a, __m128d b)
00064 {
00065   return a - b;
00066 }
00067 
00068 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00069 _mm_mul_sd(__m128d a, __m128d b)
00070 {
00071   a[0] *= b[0];
00072   return a;
00073 }
00074 
00075 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00076 _mm_mul_pd(__m128d a, __m128d b)
00077 {
00078   return a * b;
00079 }
00080 
00081 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00082 _mm_div_sd(__m128d a, __m128d b)
00083 {
00084   a[0] /= b[0];
00085   return a;
00086 }
00087 
00088 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00089 _mm_div_pd(__m128d a, __m128d b)
00090 {
00091   return a / b;
00092 }
00093 
00094 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00095 _mm_sqrt_sd(__m128d a, __m128d b)
00096 {
00097   __m128d c = __builtin_ia32_sqrtsd(b);
00098   return (__m128d) { c[0], a[1] };
00099 }
00100 
00101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00102 _mm_sqrt_pd(__m128d a)
00103 {
00104   return __builtin_ia32_sqrtpd(a);
00105 }
00106 
00107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00108 _mm_min_sd(__m128d a, __m128d b)
00109 {
00110   return __builtin_ia32_minsd(a, b);
00111 }
00112 
00113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00114 _mm_min_pd(__m128d a, __m128d b)
00115 {
00116   return __builtin_ia32_minpd(a, b);
00117 }
00118 
00119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00120 _mm_max_sd(__m128d a, __m128d b)
00121 {
00122   return __builtin_ia32_maxsd(a, b);
00123 }
00124 
00125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00126 _mm_max_pd(__m128d a, __m128d b)
00127 {
00128   return __builtin_ia32_maxpd(a, b);
00129 }
00130 
00131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00132 _mm_and_pd(__m128d a, __m128d b)
00133 {
00134   return (__m128d)((__v4si)a & (__v4si)b);
00135 }
00136 
00137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00138 _mm_andnot_pd(__m128d a, __m128d b)
00139 {
00140   return (__m128d)(~(__v4si)a & (__v4si)b);
00141 }
00142 
00143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00144 _mm_or_pd(__m128d a, __m128d b)
00145 {
00146   return (__m128d)((__v4si)a | (__v4si)b);
00147 }
00148 
00149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00150 _mm_xor_pd(__m128d a, __m128d b)
00151 {
00152   return (__m128d)((__v4si)a ^ (__v4si)b);
00153 }
00154 
00155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00156 _mm_cmpeq_pd(__m128d a, __m128d b)
00157 {
00158   return (__m128d)__builtin_ia32_cmppd(a, b, 0);
00159 }
00160 
00161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00162 _mm_cmplt_pd(__m128d a, __m128d b)
00163 {
00164   return (__m128d)__builtin_ia32_cmppd(a, b, 1);
00165 }
00166 
00167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00168 _mm_cmple_pd(__m128d a, __m128d b)
00169 {
00170   return (__m128d)__builtin_ia32_cmppd(a, b, 2);
00171 }
00172 
00173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00174 _mm_cmpgt_pd(__m128d a, __m128d b)
00175 {
00176   return (__m128d)__builtin_ia32_cmppd(b, a, 1);
00177 }
00178 
00179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00180 _mm_cmpge_pd(__m128d a, __m128d b)
00181 {
00182   return (__m128d)__builtin_ia32_cmppd(b, a, 2);
00183 }
00184 
00185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00186 _mm_cmpord_pd(__m128d a, __m128d b)
00187 {
00188   return (__m128d)__builtin_ia32_cmppd(a, b, 7);
00189 }
00190 
00191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00192 _mm_cmpunord_pd(__m128d a, __m128d b)
00193 {
00194   return (__m128d)__builtin_ia32_cmppd(a, b, 3);
00195 }
00196 
00197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00198 _mm_cmpneq_pd(__m128d a, __m128d b)
00199 {
00200   return (__m128d)__builtin_ia32_cmppd(a, b, 4);
00201 }
00202 
00203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00204 _mm_cmpnlt_pd(__m128d a, __m128d b)
00205 {
00206   return (__m128d)__builtin_ia32_cmppd(a, b, 5);
00207 }
00208 
00209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00210 _mm_cmpnle_pd(__m128d a, __m128d b)
00211 {
00212   return (__m128d)__builtin_ia32_cmppd(a, b, 6);
00213 }
00214 
00215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00216 _mm_cmpngt_pd(__m128d a, __m128d b)
00217 {
00218   return (__m128d)__builtin_ia32_cmppd(b, a, 5);
00219 }
00220 
00221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00222 _mm_cmpnge_pd(__m128d a, __m128d b)
00223 {
00224   return (__m128d)__builtin_ia32_cmppd(b, a, 6);
00225 }
00226 
00227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00228 _mm_cmpeq_sd(__m128d a, __m128d b)
00229 {
00230   return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
00231 }
00232 
00233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00234 _mm_cmplt_sd(__m128d a, __m128d b)
00235 {
00236   return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
00237 }
00238 
00239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00240 _mm_cmple_sd(__m128d a, __m128d b)
00241 {
00242   return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
00243 }
00244 
00245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00246 _mm_cmpgt_sd(__m128d a, __m128d b)
00247 {
00248   return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
00249 }
00250 
00251 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00252 _mm_cmpge_sd(__m128d a, __m128d b)
00253 {
00254   return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
00255 }
00256 
00257 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00258 _mm_cmpord_sd(__m128d a, __m128d b)
00259 {
00260   return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
00261 }
00262 
00263 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00264 _mm_cmpunord_sd(__m128d a, __m128d b)
00265 {
00266   return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
00267 }
00268 
00269 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00270 _mm_cmpneq_sd(__m128d a, __m128d b)
00271 {
00272   return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
00273 }
00274 
00275 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00276 _mm_cmpnlt_sd(__m128d a, __m128d b)
00277 {
00278   return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
00279 }
00280 
00281 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00282 _mm_cmpnle_sd(__m128d a, __m128d b)
00283 {
00284   return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
00285 }
00286 
00287 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00288 _mm_cmpngt_sd(__m128d a, __m128d b)
00289 {
00290   return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
00291 }
00292 
00293 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00294 _mm_cmpnge_sd(__m128d a, __m128d b)
00295 {
00296   return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
00297 }
00298 
00299 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00300 _mm_comieq_sd(__m128d a, __m128d b)
00301 {
00302   return __builtin_ia32_comisdeq(a, b);
00303 }
00304 
00305 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00306 _mm_comilt_sd(__m128d a, __m128d b)
00307 {
00308   return __builtin_ia32_comisdlt(a, b);
00309 }
00310 
00311 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00312 _mm_comile_sd(__m128d a, __m128d b)
00313 {
00314   return __builtin_ia32_comisdle(a, b);
00315 }
00316 
00317 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00318 _mm_comigt_sd(__m128d a, __m128d b)
00319 {
00320   return __builtin_ia32_comisdgt(a, b);
00321 }
00322 
00323 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00324 _mm_comige_sd(__m128d a, __m128d b)
00325 {
00326   return __builtin_ia32_comisdge(a, b);
00327 }
00328 
00329 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00330 _mm_comineq_sd(__m128d a, __m128d b)
00331 {
00332   return __builtin_ia32_comisdneq(a, b);
00333 }
00334 
00335 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00336 _mm_ucomieq_sd(__m128d a, __m128d b)
00337 {
00338   return __builtin_ia32_ucomisdeq(a, b);
00339 }
00340 
00341 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00342 _mm_ucomilt_sd(__m128d a, __m128d b)
00343 {
00344   return __builtin_ia32_ucomisdlt(a, b);
00345 }
00346 
00347 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00348 _mm_ucomile_sd(__m128d a, __m128d b)
00349 {
00350   return __builtin_ia32_ucomisdle(a, b);
00351 }
00352 
00353 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00354 _mm_ucomigt_sd(__m128d a, __m128d b)
00355 {
00356   return __builtin_ia32_ucomisdgt(a, b);
00357 }
00358 
00359 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00360 _mm_ucomige_sd(__m128d a, __m128d b)
00361 {
00362   return __builtin_ia32_ucomisdge(a, b);
00363 }
00364 
00365 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00366 _mm_ucomineq_sd(__m128d a, __m128d b)
00367 {
00368   return __builtin_ia32_ucomisdneq(a, b);
00369 }
00370 
00371 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00372 _mm_cvtpd_ps(__m128d a)
00373 {
00374   return __builtin_ia32_cvtpd2ps(a);
00375 }
00376 
00377 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00378 _mm_cvtps_pd(__m128 a)
00379 {
00380   return __builtin_ia32_cvtps2pd(a);
00381 }
00382 
00383 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00384 _mm_cvtepi32_pd(__m128i a)
00385 {
00386   return __builtin_ia32_cvtdq2pd((__v4si)a);
00387 }
00388 
00389 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00390 _mm_cvtpd_epi32(__m128d a)
00391 {
00392   return __builtin_ia32_cvtpd2dq(a);
00393 }
00394 
00395 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00396 _mm_cvtsd_si32(__m128d a)
00397 {
00398   return __builtin_ia32_cvtsd2si(a);
00399 }
00400 
00401 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
00402 _mm_cvtsd_ss(__m128 a, __m128d b)
00403 {
00404   a[0] = b[0];
00405   return a;
00406 }
00407 
00408 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00409 _mm_cvtsi32_sd(__m128d a, int b)
00410 {
00411   a[0] = b;
00412   return a;
00413 }
00414 
00415 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00416 _mm_cvtss_sd(__m128d a, __m128 b)
00417 {
00418   a[0] = b[0];
00419   return a;
00420 }
00421 
00422 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00423 _mm_cvttpd_epi32(__m128d a)
00424 {
00425   return (__m128i)__builtin_ia32_cvttpd2dq(a);
00426 }
00427 
00428 static __inline__ int __attribute__((__always_inline__, __nodebug__))
00429 _mm_cvttsd_si32(__m128d a)
00430 {
00431   return a[0];
00432 }
00433 
00434 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00435 _mm_cvtpd_pi32(__m128d a)
00436 {
00437   return (__m64)__builtin_ia32_cvtpd2pi(a);
00438 }
00439 
00440 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00441 _mm_cvttpd_pi32(__m128d a)
00442 {
00443   return (__m64)__builtin_ia32_cvttpd2pi(a);
00444 }
00445 
00446 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00447 _mm_cvtpi32_pd(__m64 a)
00448 {
00449   return __builtin_ia32_cvtpi2pd((__v2si)a);
00450 }
00451 
00452 static __inline__ double __attribute__((__always_inline__, __nodebug__))
00453 _mm_cvtsd_f64(__m128d a)
00454 {
00455   return a[0];
00456 }
00457 
00458 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00459 _mm_load_pd(double const *dp)
00460 {
00461   return *(__m128d*)dp;
00462 }
00463 
00464 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00465 _mm_load1_pd(double const *dp)
00466 {
00467   struct __mm_load1_pd_struct {
00468     double u;
00469   } __attribute__((__packed__, __may_alias__));
00470   double u = ((struct __mm_load1_pd_struct*)dp)->u;
00471   return (__m128d){ u, u };
00472 }
00473 
/* Alternate spelling of _mm_load1_pd. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
00475 
00476 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00477 _mm_loadr_pd(double const *dp)
00478 {
00479   __m128d u = *(__m128d*)dp;
00480   return __builtin_shufflevector(u, u, 1, 0);
00481 }
00482 
00483 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00484 _mm_loadu_pd(double const *dp)
00485 {
00486   struct __loadu_pd {
00487     __m128d v;
00488   } __attribute__((packed, may_alias));
00489   return ((struct __loadu_pd*)dp)->v;
00490 }
00491 
00492 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00493 _mm_load_sd(double const *dp)
00494 {
00495   struct __mm_load_sd_struct {
00496     double u;
00497   } __attribute__((__packed__, __may_alias__));
00498   double u = ((struct __mm_load_sd_struct*)dp)->u;
00499   return (__m128d){ u, 0 };
00500 }
00501 
00502 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00503 _mm_loadh_pd(__m128d a, double const *dp)
00504 {
00505   struct __mm_loadh_pd_struct {
00506     double u;
00507   } __attribute__((__packed__, __may_alias__));
00508   double u = ((struct __mm_loadh_pd_struct*)dp)->u;
00509   return (__m128d){ a[0], u };
00510 }
00511 
00512 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00513 _mm_loadl_pd(__m128d a, double const *dp)
00514 {
00515   struct __mm_loadl_pd_struct {
00516     double u;
00517   } __attribute__((__packed__, __may_alias__));
00518   double u = ((struct __mm_loadl_pd_struct*)dp)->u;
00519   return (__m128d){ u, a[1] }; 
00520 }
00521 
00522 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00523 _mm_set_sd(double w)
00524 {
00525   return (__m128d){ w, 0 };
00526 }
00527 
00528 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00529 _mm_set1_pd(double w)
00530 {
00531   return (__m128d){ w, w };
00532 }
00533 
00534 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00535 _mm_set_pd(double w, double x)
00536 {
00537   return (__m128d){ x, w };
00538 }
00539 
00540 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00541 _mm_setr_pd(double w, double x)
00542 {
00543   return (__m128d){ w, x };
00544 }
00545 
00546 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00547 _mm_setzero_pd(void)
00548 {
00549   return (__m128d){ 0, 0 };
00550 }
00551 
00552 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00553 _mm_move_sd(__m128d a, __m128d b)
00554 {
00555   return (__m128d){ b[0], a[1] };
00556 }
00557 
00558 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00559 _mm_store_sd(double *dp, __m128d a)
00560 {
00561   struct __mm_store_sd_struct {
00562     double u;
00563   } __attribute__((__packed__, __may_alias__));
00564   ((struct __mm_store_sd_struct*)dp)->u = a[0];
00565 }
00566 
00567 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00568 _mm_store1_pd(double *dp, __m128d a)
00569 {
00570   struct __mm_store1_pd_struct {
00571     double u[2];
00572   } __attribute__((__packed__, __may_alias__));
00573   ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
00574   ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
00575 }
00576 
00577 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00578 _mm_store_pd(double *dp, __m128d a)
00579 {
00580   *(__m128d *)dp = a;
00581 }
00582 
00583 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00584 _mm_storeu_pd(double *dp, __m128d a)
00585 {
00586   __builtin_ia32_storeupd(dp, a);
00587 }
00588 
00589 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00590 _mm_storer_pd(double *dp, __m128d a)
00591 {
00592   a = __builtin_shufflevector(a, a, 1, 0);
00593   *(__m128d *)dp = a;
00594 }
00595 
00596 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00597 _mm_storeh_pd(double *dp, __m128d a)
00598 {
00599   struct __mm_storeh_pd_struct {
00600     double u;
00601   } __attribute__((__packed__, __may_alias__));
00602   ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
00603 }
00604 
00605 static __inline__ void __attribute__((__always_inline__, __nodebug__))
00606 _mm_storel_pd(double *dp, __m128d a)
00607 {
00608   struct __mm_storeh_pd_struct {
00609     double u;
00610   } __attribute__((__packed__, __may_alias__));
00611   ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
00612 }
00613 
00614 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00615 _mm_add_epi8(__m128i a, __m128i b)
00616 {
00617   return (__m128i)((__v16qi)a + (__v16qi)b);
00618 }
00619 
00620 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00621 _mm_add_epi16(__m128i a, __m128i b)
00622 {
00623   return (__m128i)((__v8hi)a + (__v8hi)b);
00624 }
00625 
00626 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00627 _mm_add_epi32(__m128i a, __m128i b)
00628 {
00629   return (__m128i)((__v4si)a + (__v4si)b);
00630 }
00631 
00632 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00633 _mm_add_si64(__m64 a, __m64 b)
00634 {
00635   return a + b;
00636 }
00637 
00638 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00639 _mm_add_epi64(__m128i a, __m128i b)
00640 {
00641   return a + b;
00642 }
00643 
00644 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00645 _mm_adds_epi8(__m128i a, __m128i b)
00646 {
00647   return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
00648 }
00649 
00650 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00651 _mm_adds_epi16(__m128i a, __m128i b)
00652 {
00653   return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
00654 }
00655 
00656 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00657 _mm_adds_epu8(__m128i a, __m128i b)
00658 {
00659   return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
00660 }
00661 
00662 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00663 _mm_adds_epu16(__m128i a, __m128i b)
00664 {
00665   return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
00666 }
00667 
00668 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00669 _mm_avg_epu8(__m128i a, __m128i b)
00670 {
00671   return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
00672 }
00673 
00674 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00675 _mm_avg_epu16(__m128i a, __m128i b)
00676 {
00677   return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
00678 }
00679 
00680 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00681 _mm_madd_epi16(__m128i a, __m128i b)
00682 {
00683   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
00684 }
00685 
00686 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00687 _mm_max_epi16(__m128i a, __m128i b)
00688 {
00689   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
00690 }
00691 
00692 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00693 _mm_max_epu8(__m128i a, __m128i b)
00694 {
00695   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
00696 }
00697 
00698 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00699 _mm_min_epi16(__m128i a, __m128i b)
00700 {
00701   return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
00702 }
00703 
00704 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00705 _mm_min_epu8(__m128i a, __m128i b)
00706 {
00707   return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
00708 }
00709 
00710 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00711 _mm_mulhi_epi16(__m128i a, __m128i b)
00712 {
00713   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
00714 }
00715 
00716 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00717 _mm_mulhi_epu16(__m128i a, __m128i b)
00718 {
00719   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
00720 }
00721 
00722 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00723 _mm_mullo_epi16(__m128i a, __m128i b)
00724 {
00725   return (__m128i)((__v8hi)a * (__v8hi)b);
00726 }
00727 
00728 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00729 _mm_mul_su32(__m64 a, __m64 b)
00730 {
00731   return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
00732 }
00733 
00734 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00735 _mm_mul_epu32(__m128i a, __m128i b)
00736 {
00737   return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
00738 }
00739 
00740 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00741 _mm_sad_epu8(__m128i a, __m128i b)
00742 {
00743   return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
00744 }
00745 
00746 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00747 _mm_sub_epi8(__m128i a, __m128i b)
00748 {
00749   return (__m128i)((__v16qi)a - (__v16qi)b);
00750 }
00751 
00752 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00753 _mm_sub_epi16(__m128i a, __m128i b)
00754 {
00755   return (__m128i)((__v8hi)a - (__v8hi)b);
00756 }
00757 
00758 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00759 _mm_sub_epi32(__m128i a, __m128i b)
00760 {
00761   return (__m128i)((__v4si)a - (__v4si)b);
00762 }
00763 
00764 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
00765 _mm_sub_si64(__m64 a, __m64 b)
00766 {
00767   return a - b;
00768 }
00769 
00770 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00771 _mm_sub_epi64(__m128i a, __m128i b)
00772 {
00773   return a - b;
00774 }
00775 
00776 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00777 _mm_subs_epi8(__m128i a, __m128i b)
00778 {
00779   return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
00780 }
00781 
00782 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00783 _mm_subs_epi16(__m128i a, __m128i b)
00784 {
00785   return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
00786 }
00787 
00788 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00789 _mm_subs_epu8(__m128i a, __m128i b)
00790 {
00791   return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
00792 }
00793 
00794 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00795 _mm_subs_epu16(__m128i a, __m128i b)
00796 {
00797   return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
00798 }
00799 
00800 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00801 _mm_and_si128(__m128i a, __m128i b)
00802 {
00803   return a & b;
00804 }
00805 
00806 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00807 _mm_andnot_si128(__m128i a, __m128i b)
00808 {
00809   return ~a & b;
00810 }
00811 
00812 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00813 _mm_or_si128(__m128i a, __m128i b)
00814 {
00815   return a | b;
00816 }
00817 
00818 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00819 _mm_xor_si128(__m128i a, __m128i b)
00820 {
00821   return a ^ b;
00822 }
00823 
/* Whole-register shift via __builtin_ia32_pslldqi128.  `a` is evaluated once
   into a local; `count` is multiplied by 8 before being passed on (so it is
   presumably a byte count converted to bits).  Kept as a macro so that
   `count` reaches the builtin as written. */
#define _mm_slli_si128(a, count) \
  __extension__ ({ \
    __m128i __a = (a); \
    (__m128i)__builtin_ia32_pslldqi128(__a, (count) * 8); \
  })
00827 
00828 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00829 _mm_slli_epi16(__m128i a, int count)
00830 {
00831   return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
00832 }
00833 
00834 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00835 _mm_sll_epi16(__m128i a, __m128i count)
00836 {
00837   return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
00838 }
00839 
00840 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00841 _mm_slli_epi32(__m128i a, int count)
00842 {
00843   return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
00844 }
00845 
00846 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00847 _mm_sll_epi32(__m128i a, __m128i count)
00848 {
00849   return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
00850 }
00851 
00852 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00853 _mm_slli_epi64(__m128i a, int count)
00854 {
00855   return __builtin_ia32_psllqi128(a, count);
00856 }
00857 
00858 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00859 _mm_sll_epi64(__m128i a, __m128i count)
00860 {
00861   return __builtin_ia32_psllq128(a, count);
00862 }
00863 
00864 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00865 _mm_srai_epi16(__m128i a, int count)
00866 {
00867   return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
00868 }
00869 
00870 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00871 _mm_sra_epi16(__m128i a, __m128i count)
00872 {
00873   return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
00874 }
00875 
00876 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00877 _mm_srai_epi32(__m128i a, int count)
00878 {
00879   return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
00880 }
00881 
00882 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00883 _mm_sra_epi32(__m128i a, __m128i count)
00884 {
00885   return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
00886 }
00887 
00888 
/* Whole-register shift via __builtin_ia32_psrldqi128.  `a` is evaluated once
   into a local; `count` is multiplied by 8 before being passed on (so it is
   presumably a byte count converted to bits).  Kept as a macro so that
   `count` reaches the builtin as written. */
#define _mm_srli_si128(a, count) \
  __extension__ ({ \
    __m128i __a = (a); \
    (__m128i)__builtin_ia32_psrldqi128(__a, (count) * 8); \
  })
00892 
00893 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00894 _mm_srli_epi16(__m128i a, int count)
00895 {
00896   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
00897 }
00898 
00899 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00900 _mm_srl_epi16(__m128i a, __m128i count)
00901 {
00902   return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
00903 }
00904 
00905 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00906 _mm_srli_epi32(__m128i a, int count)
00907 {
00908   return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
00909 }
00910 
00911 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00912 _mm_srl_epi32(__m128i a, __m128i count)
00913 {
00914   return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
00915 }
00916 
00917 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00918 _mm_srli_epi64(__m128i a, int count)
00919 {
00920   return __builtin_ia32_psrlqi128(a, count);
00921 }
00922 
00923 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00924 _mm_srl_epi64(__m128i a, __m128i count)
00925 {
00926   return __builtin_ia32_psrlq128(a, count);
00927 }
00928 
00929 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00930 _mm_cmpeq_epi8(__m128i a, __m128i b)
00931 {
00932   return (__m128i)((__v16qi)a == (__v16qi)b);
00933 }
00934 
00935 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00936 _mm_cmpeq_epi16(__m128i a, __m128i b)
00937 {
00938   return (__m128i)((__v8hi)a == (__v8hi)b);
00939 }
00940 
00941 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00942 _mm_cmpeq_epi32(__m128i a, __m128i b)
00943 {
00944   return (__m128i)((__v4si)a == (__v4si)b);
00945 }
00946 
00947 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00948 _mm_cmpgt_epi8(__m128i a, __m128i b)
00949 {
00950   /* This function always performs a signed comparison, but __v16qi is a char
00951      which may be signed or unsigned. */
00952   typedef signed char __v16qs __attribute__((__vector_size__(16)));
00953   return (__m128i)((__v16qs)a > (__v16qs)b);
00954 }
00955 
00956 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00957 _mm_cmpgt_epi16(__m128i a, __m128i b)
00958 {
00959   return (__m128i)((__v8hi)a > (__v8hi)b);
00960 }
00961 
00962 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00963 _mm_cmpgt_epi32(__m128i a, __m128i b)
00964 {
00965   return (__m128i)((__v4si)a > (__v4si)b);
00966 }
00967 
00968 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00969 _mm_cmplt_epi8(__m128i a, __m128i b)
00970 {
00971   return _mm_cmpgt_epi8(b,a);
00972 }
00973 
00974 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00975 _mm_cmplt_epi16(__m128i a, __m128i b)
00976 {
00977   return _mm_cmpgt_epi16(b,a);
00978 }
00979 
00980 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
00981 _mm_cmplt_epi32(__m128i a, __m128i b)
00982 {
00983   return _mm_cmpgt_epi32(b,a);
00984 }
00985 
00986 #ifdef __x86_64__
00987 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
00988 _mm_cvtsi64_sd(__m128d a, long long b)
00989 {
00990   a[0] = b;
00991   return a;
00992 }
00993 
00994 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
00995 _mm_cvtsd_si64(__m128d a)
00996 {
00997   return __builtin_ia32_cvtsd2si64(a);
00998 }
00999 
01000 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
01001 _mm_cvttsd_si64(__m128d a)
01002 {
01003   return a[0];
01004 }
01005 #endif
01006 
01007 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
01008 _mm_cvtepi32_ps(__m128i a)
01009 {
01010   return __builtin_ia32_cvtdq2ps((__v4si)a);
01011 }
01012 
01013 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01014 _mm_cvtps_epi32(__m128 a)
01015 {
01016   return (__m128i)__builtin_ia32_cvtps2dq(a);
01017 }
01018 
01019 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01020 _mm_cvttps_epi32(__m128 a)
01021 {
01022   return (__m128i)__builtin_ia32_cvttps2dq(a);
01023 }
01024 
01025 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01026 _mm_cvtsi32_si128(int a)
01027 {
01028   return (__m128i)(__v4si){ a, 0, 0, 0 };
01029 }
01030 
01031 #ifdef __x86_64__
01032 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01033 _mm_cvtsi64_si128(long long a)
01034 {
01035   return (__m128i){ a, 0 };
01036 }
01037 #endif
01038 
01039 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01040 _mm_cvtsi128_si32(__m128i a)
01041 {
01042   __v4si b = (__v4si)a;
01043   return b[0];
01044 }
01045 
01046 #ifdef __x86_64__
01047 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
01048 _mm_cvtsi128_si64(__m128i a)
01049 {
01050   return a[0];
01051 }
01052 #endif
01053 
01054 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01055 _mm_load_si128(__m128i const *p)
01056 {
01057   return *p;
01058 }
01059 
01060 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01061 _mm_loadu_si128(__m128i const *p)
01062 {
01063   struct __loadu_si128 {
01064     __m128i v;
01065   } __attribute__((packed, may_alias));
01066   return ((struct __loadu_si128*)p)->v;
01067 }
01068 
01069 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01070 _mm_loadl_epi64(__m128i const *p)
01071 {
01072   struct __mm_loadl_epi64_struct {
01073     long long u;
01074   } __attribute__((__packed__, __may_alias__));
01075   return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
01076 }
01077 
01078 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01079 _mm_set_epi64x(long long q1, long long q0)
01080 {
01081   return (__m128i){ q0, q1 };
01082 }
01083 
01084 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01085 _mm_set_epi64(__m64 q1, __m64 q0)
01086 {
01087   return (__m128i){ (long long)q0, (long long)q1 };
01088 }
01089 
01090 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01091 _mm_set_epi32(int i3, int i2, int i1, int i0)
01092 {
01093   return (__m128i)(__v4si){ i0, i1, i2, i3};
01094 }
01095 
01096 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01097 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
01098 {
01099   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
01100 }
01101 
01102 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01103 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
01104 {
01105   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
01106 }
01107 
01108 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01109 _mm_set1_epi64x(long long q)
01110 {
01111   return (__m128i){ q, q };
01112 }
01113 
01114 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01115 _mm_set1_epi64(__m64 q)
01116 {
01117   return (__m128i){ (long long)q, (long long)q };
01118 }
01119 
01120 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01121 _mm_set1_epi32(int i)
01122 {
01123   return (__m128i)(__v4si){ i, i, i, i };
01124 }
01125 
01126 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01127 _mm_set1_epi16(short w)
01128 {
01129   return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
01130 }
01131 
01132 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01133 _mm_set1_epi8(char b)
01134 {
01135   return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
01136 }
01137 
01138 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01139 _mm_setr_epi64(__m64 q0, __m64 q1)
01140 {
01141   return (__m128i){ (long long)q0, (long long)q1 };
01142 }
01143 
01144 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01145 _mm_setr_epi32(int i0, int i1, int i2, int i3)
01146 {
01147   return (__m128i)(__v4si){ i0, i1, i2, i3};
01148 }
01149 
01150 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01151 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
01152 {
01153   return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
01154 }
01155 
01156 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01157 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
01158 {
01159   return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
01160 }
01161 
01162 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01163 _mm_setzero_si128(void)
01164 {
01165   return (__m128i){ 0LL, 0LL };
01166 }
01167 
01168 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01169 _mm_store_si128(__m128i *p, __m128i b)
01170 {
01171   *p = b;
01172 }
01173 
01174 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01175 _mm_storeu_si128(__m128i *p, __m128i b)
01176 {
01177   __builtin_ia32_storedqu((char *)p, (__v16qi)b);
01178 }
01179 
01180 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01181 _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
01182 {
01183   __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
01184 }
01185 
01186 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01187 _mm_storel_epi64(__m128i *p, __m128i a)
01188 {
01189   struct __mm_storel_epi64_struct {
01190     long long u;
01191   } __attribute__((__packed__, __may_alias__));
01192   ((struct __mm_storel_epi64_struct*)p)->u = a[0];
01193 }
01194 
01195 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01196 _mm_stream_pd(double *p, __m128d a)
01197 {
01198   __builtin_ia32_movntpd(p, a);
01199 }
01200 
01201 static __inline__ void __attribute__((__always_inline__, __nodebug__))
01202 _mm_stream_si128(__m128i *p, __m128i a)
01203 {
01204   __builtin_ia32_movntdq(p, a);
01205 }
01206 
/* Store the int `a` to `p` through __builtin_ia32_movnti (MOVNTI, the
   non-temporal scalar integer store). */
static __inline__ void
__attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  int *__dst = p;
  __builtin_ia32_movnti(__dst, a);
}
01212 
/* Issue __builtin_ia32_clflush (CLFLUSH) for the address `p`. */
static __inline__ void
__attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  void const *__addr = p;
  __builtin_ia32_clflush(__addr);
}
01218 
/* Issue an LFENCE instruction via __builtin_ia32_lfence(). */
static __inline__ void
__attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
  (void)0; /* no operands: the builtin call below is the whole body */
  __builtin_ia32_lfence();
}
01224 
/* Issue an MFENCE instruction via __builtin_ia32_mfence(). */
static __inline__ void
__attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
  (void)0; /* no operands: the builtin call below is the whole body */
  __builtin_ia32_mfence();
}
01230 
01231 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01232 _mm_packs_epi16(__m128i a, __m128i b)
01233 {
01234   return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
01235 }
01236 
01237 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01238 _mm_packs_epi32(__m128i a, __m128i b)
01239 {
01240   return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
01241 }
01242 
01243 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01244 _mm_packus_epi16(__m128i a, __m128i b)
01245 {
01246   return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
01247 }
01248 
01249 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01250 _mm_extract_epi16(__m128i a, int imm)
01251 {
01252   __v8hi b = (__v8hi)a;
01253   return (unsigned short)b[imm];
01254 }
01255 
01256 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01257 _mm_insert_epi16(__m128i a, int b, int imm)
01258 {
01259   __v8hi c = (__v8hi)a;
01260   c[imm & 7] = b;
01261   return (__m128i)c;
01262 }
01263 
01264 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01265 _mm_movemask_epi8(__m128i a)
01266 {
01267   return __builtin_ia32_pmovmskb128((__v16qi)a);
01268 }
01269 
/* Permute the four 32-bit lanes of `a`. Each 2-bit field of `imm` (bits 1:0,
   3:2, 5:4, 7:6) selects the source lane for result lanes 0..3. All indices
   are below 4, so only the first shuffle operand is ever read; the second
   is a placeholder. `imm` must be a compile-time constant because it feeds
   __builtin_shufflevector. */
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_shufflevector( \
      (__v4si)__src, (__v4si) _mm_set1_epi32(0), \
      ((imm) & 0x03) >> 0, \
      ((imm) & 0x0c) >> 2, \
      ((imm) & 0x30) >> 4, \
      ((imm) & 0xc0) >> 6); })
01275 
/* Permute the low four 16-bit lanes of `a`; lanes 4..7 are copied through
   unchanged. Each 2-bit field of `imm` selects the source (0..3) for result
   lanes 0..3. Only the first shuffle operand is ever read; the second is a
   placeholder. `imm` must be a compile-time constant because it feeds
   __builtin_shufflevector. */
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)__src, (__v8hi) _mm_set1_epi16(0), \
      ((imm) & 0x03) >> 0, \
      ((imm) & 0x0c) >> 2, \
      ((imm) & 0x30) >> 4, \
      ((imm) & 0xc0) >> 6, \
      4, 5, 6, 7); })
01282 
/* Permute the high four 16-bit lanes of `a`; lanes 0..3 are copied through
   unchanged. Each 2-bit field of `imm` selects, relative to lane 4, the
   source for result lanes 4..7. Only the first shuffle operand is ever read;
   the second is a placeholder. `imm` must be a compile-time constant because
   it feeds __builtin_shufflevector. */
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  __m128i __src = (a); \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)__src, (__v8hi) _mm_set1_epi16(0), \
      0, 1, 2, 3, \
      (((imm) & 0x03) >> 0) + 4, \
      (((imm) & 0x0c) >> 2) + 4, \
      (((imm) & 0x30) >> 4) + 4, \
      (((imm) & 0xc0) >> 6) + 4); })
01291 
01292 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01293 _mm_unpackhi_epi8(__m128i a, __m128i b)
01294 {
01295   return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
01296 }
01297 
01298 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01299 _mm_unpackhi_epi16(__m128i a, __m128i b)
01300 {
01301   return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
01302 }
01303 
01304 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01305 _mm_unpackhi_epi32(__m128i a, __m128i b)
01306 {
01307   return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
01308 }
01309 
01310 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01311 _mm_unpackhi_epi64(__m128i a, __m128i b)
01312 {
01313   return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
01314 }
01315 
01316 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01317 _mm_unpacklo_epi8(__m128i a, __m128i b)
01318 {
01319   return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
01320 }
01321 
01322 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01323 _mm_unpacklo_epi16(__m128i a, __m128i b)
01324 {
01325   return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
01326 }
01327 
01328 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01329 _mm_unpacklo_epi32(__m128i a, __m128i b)
01330 {
01331   return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
01332 }
01333 
01334 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01335 _mm_unpacklo_epi64(__m128i a, __m128i b)
01336 {
01337   return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
01338 }
01339 
01340 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
01341 _mm_movepi64_pi64(__m128i a)
01342 {
01343   return (__m64)a[0];
01344 }
01345 
01346 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01347 _mm_movpi64_pi64(__m64 a)
01348 {
01349   return (__m128i){ (long long)a, 0 };
01350 }
01351 
01352 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01353 _mm_move_epi64(__m128i a)
01354 {
01355   return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
01356 }
01357 
01358 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01359 _mm_unpackhi_pd(__m128d a, __m128d b)
01360 {
01361   return __builtin_shufflevector(a, b, 1, 2+1);
01362 }
01363 
01364 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01365 _mm_unpacklo_pd(__m128d a, __m128d b)
01366 {
01367   return __builtin_shufflevector(a, b, 0, 2+0);
01368 }
01369 
01370 static __inline__ int __attribute__((__always_inline__, __nodebug__))
01371 _mm_movemask_pd(__m128d a)
01372 {
01373   return __builtin_ia32_movmskpd(a);
01374 }
01375 
/* Select one double from each operand: bit 0 of `i` picks element 0 or 1 of
   `a` for result element 0, and bit 1 of `i` picks element 0 or 1 of `b`
   (shuffle indices 2 or 3) for result element 1. `i` must be a compile-time
   constant because it feeds __builtin_shufflevector. */
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __m128d __lo_src = (a); \
  __m128d __hi_src = (b); \
  __builtin_shufflevector(__lo_src, __hi_src, \
                          (i) & 1, \
                          (((i) & 2) >> 1) + 2); })
01380 
01381 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
01382 _mm_castpd_ps(__m128d in)
01383 {
01384   return (__m128)in;
01385 }
01386 
01387 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01388 _mm_castpd_si128(__m128d in)
01389 {
01390   return (__m128i)in;
01391 }
01392 
01393 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01394 _mm_castps_pd(__m128 in)
01395 {
01396   return (__m128d)in;
01397 }
01398 
01399 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
01400 _mm_castps_si128(__m128 in)
01401 {
01402   return (__m128i)in;
01403 }
01404 
01405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
01406 _mm_castsi128_ps(__m128i in)
01407 {
01408   return (__m128)in;
01409 }
01410 
01411 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
01412 _mm_castsi128_pd(__m128i in)
01413 {
01414   return (__m128d)in;
01415 }
01416 
/* Emit a single PAUSE instruction through a volatile inline-asm statement. */
static __inline__ void
__attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ __volatile__ ("pause");
}
01422 
/* Compose a 2-bit selector: `x` goes into bit 1 and `y` into bit 0. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
01424 
01425 #endif /* __SSE2__ */
01426 
01427 #endif /* __EMMINTRIN_H */