clang API Documentation
00001 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 00002 * 00003 * Permission is hereby granted, free of charge, to any person obtaining a copy 00004 * of this software and associated documentation files (the "Software"), to deal 00005 * in the Software without restriction, including without limitation the rights 00006 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00007 * copies of the Software, and to permit persons to whom the Software is 00008 * furnished to do so, subject to the following conditions: 00009 * 00010 * The above copyright notice and this permission notice shall be included in 00011 * all copies or substantial portions of the Software. 00012 * 00013 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00014 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00015 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00016 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00017 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00018 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00019 * THE SOFTWARE. 00020 * 00021 *===-----------------------------------------------------------------------=== 00022 */ 00023 00024 #ifndef __EMMINTRIN_H 00025 #define __EMMINTRIN_H 00026 00027 #ifndef __SSE2__ 00028 #error "SSE2 instruction set not enabled" 00029 #else 00030 00031 #include <xmmintrin.h> 00032 00033 typedef double __m128d __attribute__((__vector_size__(16))); 00034 typedef long long __m128i __attribute__((__vector_size__(16))); 00035 00036 /* Type defines. */ 00037 typedef double __v2df __attribute__ ((__vector_size__ (16))); 00038 typedef long long __v2di __attribute__ ((__vector_size__ (16))); 00039 typedef short __v8hi __attribute__((__vector_size__(16))); 00040 typedef char __v16qi __attribute__((__vector_size__(16))); 00041 00042 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00043 _mm_add_sd(__m128d a, __m128d b) 00044 { 00045 a[0] += b[0]; 00046 return a; 00047 } 00048 00049 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00050 _mm_add_pd(__m128d a, __m128d b) 00051 { 00052 return a + b; 00053 } 00054 00055 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00056 _mm_sub_sd(__m128d a, __m128d b) 00057 { 00058 a[0] -= b[0]; 00059 return a; 00060 } 00061 00062 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00063 _mm_sub_pd(__m128d a, __m128d b) 00064 { 00065 return a - b; 00066 } 00067 00068 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00069 _mm_mul_sd(__m128d a, __m128d b) 00070 { 00071 a[0] *= b[0]; 00072 return a; 00073 } 00074 00075 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00076 _mm_mul_pd(__m128d a, __m128d b) 00077 { 00078 return a * b; 00079 } 00080 00081 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00082 _mm_div_sd(__m128d a, __m128d b) 00083 { 00084 a[0] /= b[0]; 00085 return a; 00086 } 00087 00088 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00089 _mm_div_pd(__m128d a, __m128d b) 00090 { 00091 return a / b; 00092 } 00093 00094 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00095 _mm_sqrt_sd(__m128d a, __m128d b) 00096 { 00097 __m128d c = __builtin_ia32_sqrtsd(b); 00098 return (__m128d) { c[0], a[1] }; 00099 } 00100 00101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00102 _mm_sqrt_pd(__m128d a) 00103 { 00104 return __builtin_ia32_sqrtpd(a); 00105 } 00106 00107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00108 _mm_min_sd(__m128d a, __m128d b) 00109 { 00110 return __builtin_ia32_minsd(a, b); 00111 } 00112 00113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00114 _mm_min_pd(__m128d a, __m128d b) 00115 { 00116 return __builtin_ia32_minpd(a, b); 00117 } 00118 00119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00120 _mm_max_sd(__m128d a, __m128d b) 00121 { 00122 return __builtin_ia32_maxsd(a, b); 00123 } 00124 00125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00126 _mm_max_pd(__m128d a, __m128d b) 00127 { 00128 return __builtin_ia32_maxpd(a, b); 00129 } 00130 00131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00132 _mm_and_pd(__m128d a, __m128d b) 00133 { 00134 return (__m128d)((__v4si)a & (__v4si)b); 00135 } 00136 00137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00138 _mm_andnot_pd(__m128d a, __m128d b) 00139 { 00140 return (__m128d)(~(__v4si)a & (__v4si)b); 00141 } 00142 00143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00144 _mm_or_pd(__m128d a, __m128d b) 00145 { 00146 return (__m128d)((__v4si)a | (__v4si)b); 00147 } 00148 00149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00150 _mm_xor_pd(__m128d a, __m128d b) 00151 { 00152 return (__m128d)((__v4si)a ^ (__v4si)b); 00153 } 00154 00155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00156 _mm_cmpeq_pd(__m128d a, __m128d b) 00157 { 00158 return (__m128d)__builtin_ia32_cmppd(a, b, 0); 00159 } 00160 00161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00162 _mm_cmplt_pd(__m128d a, __m128d b) 00163 { 00164 return (__m128d)__builtin_ia32_cmppd(a, b, 1); 00165 } 00166 00167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00168 _mm_cmple_pd(__m128d a, __m128d b) 00169 { 00170 return (__m128d)__builtin_ia32_cmppd(a, b, 2); 00171 } 00172 00173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00174 _mm_cmpgt_pd(__m128d a, __m128d b) 00175 { 00176 return (__m128d)__builtin_ia32_cmppd(b, a, 1); 00177 } 00178 00179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00180 _mm_cmpge_pd(__m128d a, __m128d b) 00181 { 00182 return (__m128d)__builtin_ia32_cmppd(b, a, 2); 00183 } 00184 00185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00186 _mm_cmpord_pd(__m128d a, __m128d b) 00187 { 00188 return (__m128d)__builtin_ia32_cmppd(a, b, 7); 00189 } 00190 00191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00192 _mm_cmpunord_pd(__m128d a, __m128d b) 00193 { 00194 return (__m128d)__builtin_ia32_cmppd(a, b, 3); 00195 } 00196 00197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00198 _mm_cmpneq_pd(__m128d a, __m128d b) 00199 { 00200 return (__m128d)__builtin_ia32_cmppd(a, b, 4); 00201 } 00202 00203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00204 _mm_cmpnlt_pd(__m128d a, __m128d b) 00205 { 00206 return (__m128d)__builtin_ia32_cmppd(a, b, 5); 00207 } 00208 00209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00210 _mm_cmpnle_pd(__m128d a, __m128d b) 00211 { 00212 return (__m128d)__builtin_ia32_cmppd(a, b, 6); 00213 } 00214 00215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00216 _mm_cmpngt_pd(__m128d a, __m128d b) 00217 { 00218 return (__m128d)__builtin_ia32_cmppd(b, a, 5); 00219 } 00220 00221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00222 _mm_cmpnge_pd(__m128d a, __m128d b) 00223 { 00224 return (__m128d)__builtin_ia32_cmppd(b, a, 6); 00225 } 00226 00227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00228 _mm_cmpeq_sd(__m128d a, __m128d b) 00229 { 00230 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); 00231 } 00232 00233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00234 _mm_cmplt_sd(__m128d a, __m128d b) 00235 { 00236 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); 00237 } 00238 00239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00240 _mm_cmple_sd(__m128d a, __m128d b) 00241 { 00242 return (__m128d)__builtin_ia32_cmpsd(a, b, 2); 00243 } 00244 00245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00246 _mm_cmpgt_sd(__m128d a, __m128d b) 00247 { 00248 return (__m128d)__builtin_ia32_cmpsd(b, a, 1); 00249 } 00250 00251 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00252 _mm_cmpge_sd(__m128d a, __m128d b) 00253 { 00254 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); 00255 } 00256 00257 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00258 _mm_cmpord_sd(__m128d a, __m128d b) 00259 { 00260 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); 00261 } 00262 00263 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00264 _mm_cmpunord_sd(__m128d a, __m128d b) 00265 { 00266 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); 00267 } 00268 00269 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00270 _mm_cmpneq_sd(__m128d a, __m128d b) 00271 { 00272 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); 00273 } 00274 00275 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00276 _mm_cmpnlt_sd(__m128d a, __m128d b) 00277 { 00278 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); 00279 } 00280 00281 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00282 _mm_cmpnle_sd(__m128d a, __m128d b) 00283 { 00284 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); 00285 } 00286 00287 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00288 _mm_cmpngt_sd(__m128d a, __m128d b) 00289 { 00290 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); 00291 } 00292 00293 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00294 _mm_cmpnge_sd(__m128d a, __m128d b) 00295 { 00296 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); 00297 } 00298 00299 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00300 _mm_comieq_sd(__m128d a, __m128d b) 00301 { 00302 return __builtin_ia32_comisdeq(a, b); 00303 } 00304 00305 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00306 _mm_comilt_sd(__m128d a, __m128d b) 00307 { 00308 return __builtin_ia32_comisdlt(a, b); 00309 } 00310 00311 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00312 _mm_comile_sd(__m128d a, __m128d b) 00313 { 00314 return __builtin_ia32_comisdle(a, b); 00315 } 00316 00317 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00318 _mm_comigt_sd(__m128d a, __m128d b) 00319 { 00320 return __builtin_ia32_comisdgt(a, b); 00321 } 00322 00323 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00324 _mm_comige_sd(__m128d a, __m128d b) 00325 { 00326 return __builtin_ia32_comisdge(a, b); 00327 } 00328 00329 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00330 _mm_comineq_sd(__m128d a, __m128d b) 00331 { 00332 return __builtin_ia32_comisdneq(a, b); 00333 } 00334 00335 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00336 _mm_ucomieq_sd(__m128d a, __m128d b) 00337 { 00338 return __builtin_ia32_ucomisdeq(a, b); 00339 } 00340 00341 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00342 _mm_ucomilt_sd(__m128d a, __m128d b) 00343 { 00344 return __builtin_ia32_ucomisdlt(a, b); 00345 } 00346 00347 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00348 _mm_ucomile_sd(__m128d a, __m128d b) 00349 { 00350 return __builtin_ia32_ucomisdle(a, b); 00351 } 00352 00353 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00354 _mm_ucomigt_sd(__m128d a, __m128d b) 00355 { 00356 return __builtin_ia32_ucomisdgt(a, b); 00357 } 00358 00359 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00360 _mm_ucomige_sd(__m128d a, __m128d b) 00361 { 00362 return __builtin_ia32_ucomisdge(a, b); 00363 } 00364 00365 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00366 _mm_ucomineq_sd(__m128d a, __m128d b) 00367 { 00368 return __builtin_ia32_ucomisdneq(a, b); 00369 } 00370 00371 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 00372 _mm_cvtpd_ps(__m128d a) 00373 { 00374 return __builtin_ia32_cvtpd2ps(a); 00375 } 00376 00377 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00378 _mm_cvtps_pd(__m128 a) 00379 { 00380 return __builtin_ia32_cvtps2pd(a); 00381 } 00382 00383 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00384 _mm_cvtepi32_pd(__m128i a) 00385 { 00386 return __builtin_ia32_cvtdq2pd((__v4si)a); 00387 } 00388 00389 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00390 _mm_cvtpd_epi32(__m128d a) 00391 { 00392 return __builtin_ia32_cvtpd2dq(a); 00393 } 00394 00395 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00396 _mm_cvtsd_si32(__m128d a) 00397 { 00398 return __builtin_ia32_cvtsd2si(a); 00399 } 00400 00401 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 00402 _mm_cvtsd_ss(__m128 a, __m128d b) 00403 { 00404 a[0] = b[0]; 00405 return a; 00406 } 00407 00408 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00409 _mm_cvtsi32_sd(__m128d a, int b) 00410 { 00411 a[0] = b; 00412 return a; 00413 } 00414 00415 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00416 _mm_cvtss_sd(__m128d a, __m128 b) 00417 { 00418 a[0] = b[0]; 00419 return a; 00420 } 00421 00422 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00423 _mm_cvttpd_epi32(__m128d a) 00424 { 00425 return (__m128i)__builtin_ia32_cvttpd2dq(a); 00426 } 00427 00428 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 00429 _mm_cvttsd_si32(__m128d a) 00430 { 00431 return a[0]; 00432 } 00433 00434 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00435 _mm_cvtpd_pi32(__m128d a) 00436 { 00437 return (__m64)__builtin_ia32_cvtpd2pi(a); 00438 } 00439 00440 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00441 _mm_cvttpd_pi32(__m128d a) 00442 { 00443 return (__m64)__builtin_ia32_cvttpd2pi(a); 00444 } 00445 00446 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00447 _mm_cvtpi32_pd(__m64 a) 00448 { 00449 return __builtin_ia32_cvtpi2pd((__v2si)a); 00450 } 00451 00452 static __inline__ double __attribute__((__always_inline__, __nodebug__)) 00453 _mm_cvtsd_f64(__m128d a) 00454 { 00455 return a[0]; 00456 } 00457 00458 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00459 _mm_load_pd(double const *dp) 00460 { 00461 return *(__m128d*)dp; 00462 } 00463 00464 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00465 _mm_load1_pd(double const *dp) 00466 { 00467 struct __mm_load1_pd_struct { 00468 double u; 00469 } __attribute__((__packed__, __may_alias__)); 00470 double u = ((struct __mm_load1_pd_struct*)dp)->u; 00471 return (__m128d){ u, u }; 00472 } 00473 00474 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 00475 00476 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00477 _mm_loadr_pd(double const *dp) 00478 { 00479 __m128d u = *(__m128d*)dp; 00480 return __builtin_shufflevector(u, u, 1, 0); 00481 } 00482 00483 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00484 _mm_loadu_pd(double const *dp) 00485 { 00486 struct __loadu_pd { 00487 __m128d v; 00488 } __attribute__((packed, may_alias)); 00489 return ((struct __loadu_pd*)dp)->v; 00490 } 00491 00492 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00493 _mm_load_sd(double const *dp) 00494 { 00495 struct __mm_load_sd_struct { 00496 double u; 00497 } __attribute__((__packed__, __may_alias__)); 00498 double u = ((struct __mm_load_sd_struct*)dp)->u; 00499 return (__m128d){ u, 0 }; 00500 } 00501 00502 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00503 _mm_loadh_pd(__m128d a, double const *dp) 00504 { 00505 struct __mm_loadh_pd_struct { 00506 double u; 00507 } __attribute__((__packed__, __may_alias__)); 00508 double u = ((struct __mm_loadh_pd_struct*)dp)->u; 00509 return (__m128d){ a[0], u }; 00510 } 00511 00512 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00513 _mm_loadl_pd(__m128d a, double const *dp) 00514 { 00515 struct __mm_loadl_pd_struct { 00516 double u; 00517 } __attribute__((__packed__, __may_alias__)); 00518 double u = ((struct __mm_loadl_pd_struct*)dp)->u; 00519 return (__m128d){ u, a[1] }; 00520 } 00521 00522 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00523 _mm_set_sd(double w) 00524 { 00525 return (__m128d){ w, 0 }; 00526 } 00527 00528 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00529 _mm_set1_pd(double w) 00530 { 00531 return (__m128d){ w, w }; 00532 } 00533 00534 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00535 _mm_set_pd(double w, double x) 00536 { 00537 return (__m128d){ x, w }; 00538 } 00539 00540 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00541 _mm_setr_pd(double w, double x) 00542 { 00543 return (__m128d){ w, x }; 00544 } 00545 00546 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00547 _mm_setzero_pd(void) 00548 { 00549 return (__m128d){ 0, 0 }; 00550 } 00551 00552 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00553 _mm_move_sd(__m128d a, __m128d b) 00554 { 00555 return (__m128d){ b[0], a[1] }; 00556 } 00557 00558 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00559 _mm_store_sd(double *dp, __m128d a) 00560 { 00561 struct __mm_store_sd_struct { 00562 double u; 00563 } __attribute__((__packed__, __may_alias__)); 00564 ((struct __mm_store_sd_struct*)dp)->u = a[0]; 00565 } 00566 00567 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00568 _mm_store1_pd(double *dp, __m128d a) 00569 { 00570 struct __mm_store1_pd_struct { 00571 double u[2]; 00572 } __attribute__((__packed__, __may_alias__)); 00573 ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0]; 00574 ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0]; 00575 } 00576 00577 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00578 _mm_store_pd(double *dp, __m128d a) 00579 { 00580 *(__m128d *)dp = a; 00581 } 00582 00583 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00584 _mm_storeu_pd(double *dp, __m128d a) 00585 { 00586 __builtin_ia32_storeupd(dp, a); 00587 } 00588 00589 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00590 _mm_storer_pd(double *dp, __m128d a) 00591 { 00592 a = __builtin_shufflevector(a, a, 1, 0); 00593 *(__m128d *)dp = a; 00594 } 00595 00596 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00597 _mm_storeh_pd(double *dp, __m128d a) 00598 { 00599 struct __mm_storeh_pd_struct { 00600 double u; 00601 } __attribute__((__packed__, __may_alias__)); 00602 ((struct __mm_storeh_pd_struct*)dp)->u = a[1]; 00603 } 00604 00605 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 00606 _mm_storel_pd(double *dp, __m128d a) 00607 { 00608 struct __mm_storeh_pd_struct { 00609 double u; 00610 } __attribute__((__packed__, __may_alias__)); 00611 ((struct __mm_storeh_pd_struct*)dp)->u = a[0]; 00612 } 00613 00614 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00615 _mm_add_epi8(__m128i a, __m128i b) 00616 { 00617 return (__m128i)((__v16qi)a + (__v16qi)b); 00618 } 00619 00620 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00621 _mm_add_epi16(__m128i a, __m128i b) 00622 { 00623 return (__m128i)((__v8hi)a + (__v8hi)b); 00624 } 00625 00626 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00627 _mm_add_epi32(__m128i a, __m128i b) 00628 { 00629 return (__m128i)((__v4si)a + (__v4si)b); 00630 } 00631 00632 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00633 _mm_add_si64(__m64 a, __m64 b) 00634 { 00635 return a + b; 00636 } 00637 00638 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00639 _mm_add_epi64(__m128i a, __m128i b) 00640 { 00641 return a + b; 00642 } 00643 00644 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00645 _mm_adds_epi8(__m128i a, __m128i b) 00646 { 00647 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 00648 } 00649 00650 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00651 _mm_adds_epi16(__m128i a, __m128i b) 00652 { 00653 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 00654 } 00655 00656 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00657 _mm_adds_epu8(__m128i a, __m128i b) 00658 { 00659 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 00660 } 00661 00662 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00663 _mm_adds_epu16(__m128i a, __m128i b) 00664 { 00665 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 00666 } 00667 00668 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00669 _mm_avg_epu8(__m128i a, __m128i b) 00670 { 00671 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 00672 } 00673 00674 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00675 _mm_avg_epu16(__m128i a, __m128i b) 00676 { 00677 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 00678 } 00679 00680 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00681 _mm_madd_epi16(__m128i a, __m128i b) 00682 { 00683 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 00684 } 00685 00686 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00687 _mm_max_epi16(__m128i a, __m128i b) 00688 { 00689 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 00690 } 00691 00692 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00693 _mm_max_epu8(__m128i a, __m128i b) 00694 { 00695 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 00696 } 00697 00698 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00699 _mm_min_epi16(__m128i a, __m128i b) 00700 { 00701 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 00702 } 00703 00704 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00705 _mm_min_epu8(__m128i a, __m128i b) 00706 { 00707 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 00708 } 00709 00710 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00711 _mm_mulhi_epi16(__m128i a, __m128i b) 00712 { 00713 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 00714 } 00715 00716 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00717 _mm_mulhi_epu16(__m128i a, __m128i b) 00718 { 00719 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 00720 } 00721 00722 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00723 _mm_mullo_epi16(__m128i a, __m128i b) 00724 { 00725 return (__m128i)((__v8hi)a * (__v8hi)b); 00726 } 00727 00728 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00729 _mm_mul_su32(__m64 a, __m64 b) 00730 { 00731 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 00732 } 00733 00734 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00735 _mm_mul_epu32(__m128i a, __m128i b) 00736 { 00737 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 00738 } 00739 00740 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00741 _mm_sad_epu8(__m128i a, __m128i b) 00742 { 00743 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 00744 } 00745 00746 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00747 _mm_sub_epi8(__m128i a, __m128i b) 00748 { 00749 return (__m128i)((__v16qi)a - (__v16qi)b); 00750 } 00751 00752 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00753 _mm_sub_epi16(__m128i a, __m128i b) 00754 { 00755 return (__m128i)((__v8hi)a - (__v8hi)b); 00756 } 00757 00758 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00759 _mm_sub_epi32(__m128i a, __m128i b) 00760 { 00761 return (__m128i)((__v4si)a - (__v4si)b); 00762 } 00763 00764 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 00765 _mm_sub_si64(__m64 a, __m64 b) 00766 { 00767 return a - b; 00768 } 00769 00770 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00771 _mm_sub_epi64(__m128i a, __m128i b) 00772 { 00773 return a - b; 00774 } 00775 00776 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00777 _mm_subs_epi8(__m128i a, __m128i b) 00778 { 00779 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 00780 } 00781 00782 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00783 _mm_subs_epi16(__m128i a, __m128i b) 00784 { 00785 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 00786 } 00787 00788 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00789 _mm_subs_epu8(__m128i a, __m128i b) 00790 { 00791 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 00792 } 00793 00794 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00795 _mm_subs_epu16(__m128i a, __m128i b) 00796 { 00797 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 00798 } 00799 00800 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00801 _mm_and_si128(__m128i a, __m128i b) 00802 { 00803 return a & b; 00804 } 00805 00806 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00807 _mm_andnot_si128(__m128i a, __m128i b) 00808 { 00809 return ~a & b; 00810 } 00811 00812 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00813 _mm_or_si128(__m128i a, __m128i b) 00814 { 00815 return a | b; 00816 } 00817 00818 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00819 _mm_xor_si128(__m128i a, __m128i b) 00820 { 00821 return a ^ b; 00822 } 00823 00824 #define _mm_slli_si128(a, count) __extension__ ({ \ 00825 __m128i __a = (a); \ 00826 (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); }) 00827 00828 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00829 _mm_slli_epi16(__m128i a, int count) 00830 { 00831 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 00832 } 00833 00834 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00835 _mm_sll_epi16(__m128i a, __m128i count) 00836 { 00837 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 00838 } 00839 00840 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00841 _mm_slli_epi32(__m128i a, int count) 00842 { 00843 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 00844 } 00845 00846 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00847 _mm_sll_epi32(__m128i a, __m128i count) 00848 { 00849 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 00850 } 00851 00852 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00853 _mm_slli_epi64(__m128i a, int count) 00854 { 00855 return __builtin_ia32_psllqi128(a, count); 00856 } 00857 00858 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00859 _mm_sll_epi64(__m128i a, __m128i count) 00860 { 00861 return __builtin_ia32_psllq128(a, count); 00862 } 00863 00864 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00865 _mm_srai_epi16(__m128i a, int count) 00866 { 00867 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 00868 } 00869 00870 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00871 _mm_sra_epi16(__m128i a, __m128i count) 00872 { 00873 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 00874 } 00875 00876 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00877 _mm_srai_epi32(__m128i a, int count) 00878 { 00879 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 00880 } 00881 00882 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00883 _mm_sra_epi32(__m128i a, __m128i count) 00884 { 00885 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 00886 } 00887 00888 00889 #define _mm_srli_si128(a, count) __extension__ ({ \ 00890 __m128i __a = (a); \ 00891 (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); }) 00892 00893 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00894 _mm_srli_epi16(__m128i a, int count) 00895 { 00896 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 00897 } 00898 00899 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00900 _mm_srl_epi16(__m128i a, __m128i count) 00901 { 00902 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 00903 } 00904 00905 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00906 _mm_srli_epi32(__m128i a, int count) 00907 { 00908 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 00909 } 00910 00911 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00912 _mm_srl_epi32(__m128i a, __m128i count) 00913 { 00914 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 00915 } 00916 00917 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00918 _mm_srli_epi64(__m128i a, int count) 00919 { 00920 return __builtin_ia32_psrlqi128(a, count); 00921 } 00922 00923 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00924 _mm_srl_epi64(__m128i a, __m128i count) 00925 { 00926 return __builtin_ia32_psrlq128(a, count); 00927 } 00928 00929 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00930 _mm_cmpeq_epi8(__m128i a, __m128i b) 00931 { 00932 return (__m128i)((__v16qi)a == (__v16qi)b); 00933 } 00934 00935 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00936 _mm_cmpeq_epi16(__m128i a, __m128i b) 00937 { 00938 return (__m128i)((__v8hi)a == (__v8hi)b); 00939 } 00940 00941 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00942 _mm_cmpeq_epi32(__m128i a, __m128i b) 00943 { 00944 return (__m128i)((__v4si)a == (__v4si)b); 00945 } 00946 00947 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00948 _mm_cmpgt_epi8(__m128i a, __m128i b) 00949 { 00950 /* This function always performs a signed comparison, but __v16qi is a char 00951 which may be signed or unsigned. */ 00952 typedef signed char __v16qs __attribute__((__vector_size__(16))); 00953 return (__m128i)((__v16qs)a > (__v16qs)b); 00954 } 00955 00956 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00957 _mm_cmpgt_epi16(__m128i a, __m128i b) 00958 { 00959 return (__m128i)((__v8hi)a > (__v8hi)b); 00960 } 00961 00962 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00963 _mm_cmpgt_epi32(__m128i a, __m128i b) 00964 { 00965 return (__m128i)((__v4si)a > (__v4si)b); 00966 } 00967 00968 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00969 _mm_cmplt_epi8(__m128i a, __m128i b) 00970 { 00971 return _mm_cmpgt_epi8(b,a); 00972 } 00973 00974 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00975 _mm_cmplt_epi16(__m128i a, __m128i b) 00976 { 00977 return _mm_cmpgt_epi16(b,a); 00978 } 00979 00980 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 00981 _mm_cmplt_epi32(__m128i a, __m128i b) 00982 { 00983 return _mm_cmpgt_epi32(b,a); 00984 } 00985 00986 #ifdef __x86_64__ 00987 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 00988 _mm_cvtsi64_sd(__m128d a, long long b) 00989 { 00990 a[0] = b; 00991 return a; 00992 } 00993 00994 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 00995 _mm_cvtsd_si64(__m128d a) 00996 { 00997 return __builtin_ia32_cvtsd2si64(a); 00998 } 00999 01000 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 01001 _mm_cvttsd_si64(__m128d a) 01002 { 01003 return a[0]; 01004 } 01005 #endif 01006 01007 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 01008 _mm_cvtepi32_ps(__m128i a) 01009 { 01010 return __builtin_ia32_cvtdq2ps((__v4si)a); 01011 } 01012 01013 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01014 _mm_cvtps_epi32(__m128 a) 01015 { 01016 return (__m128i)__builtin_ia32_cvtps2dq(a); 01017 } 01018 01019 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01020 _mm_cvttps_epi32(__m128 a) 01021 { 01022 return (__m128i)__builtin_ia32_cvttps2dq(a); 01023 } 01024 01025 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01026 _mm_cvtsi32_si128(int a) 01027 { 01028 return (__m128i)(__v4si){ a, 0, 0, 0 }; 01029 } 01030 01031 #ifdef __x86_64__ 01032 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01033 _mm_cvtsi64_si128(long long a) 01034 { 01035 return (__m128i){ a, 0 }; 01036 } 01037 #endif 01038 01039 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01040 _mm_cvtsi128_si32(__m128i a) 01041 { 01042 __v4si b = (__v4si)a; 01043 return b[0]; 01044 } 01045 01046 #ifdef __x86_64__ 01047 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 01048 _mm_cvtsi128_si64(__m128i a) 01049 { 01050 return a[0]; 01051 } 01052 #endif 01053 01054 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01055 _mm_load_si128(__m128i const *p) 01056 { 01057 return *p; 01058 } 01059 01060 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01061 _mm_loadu_si128(__m128i const *p) 01062 { 01063 struct __loadu_si128 { 01064 __m128i v; 01065 } __attribute__((packed, may_alias)); 01066 return ((struct __loadu_si128*)p)->v; 01067 } 01068 01069 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01070 _mm_loadl_epi64(__m128i const *p) 01071 { 01072 struct __mm_loadl_epi64_struct { 01073 long long u; 01074 } __attribute__((__packed__, __may_alias__)); 01075 return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0}; 01076 } 01077 01078 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01079 _mm_set_epi64x(long long q1, long long q0) 01080 { 01081 return (__m128i){ q0, q1 }; 01082 } 01083 01084 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01085 _mm_set_epi64(__m64 q1, __m64 q0) 01086 { 01087 return (__m128i){ (long long)q0, (long long)q1 }; 01088 } 01089 01090 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01091 _mm_set_epi32(int i3, int i2, int i1, int i0) 01092 { 01093 return (__m128i)(__v4si){ i0, i1, i2, i3}; 01094 } 01095 01096 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01097 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 01098 { 01099 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 01100 } 01101 01102 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01103 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 01104 { 01105 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 01106 } 01107 01108 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01109 _mm_set1_epi64x(long long q) 01110 { 01111 return (__m128i){ q, q }; 01112 } 01113 01114 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01115 _mm_set1_epi64(__m64 q) 01116 { 01117 return (__m128i){ (long long)q, (long long)q }; 01118 } 01119 01120 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01121 _mm_set1_epi32(int i) 01122 { 01123 return (__m128i)(__v4si){ i, i, i, i }; 01124 } 01125 01126 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01127 _mm_set1_epi16(short w) 01128 { 01129 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; 01130 } 01131 01132 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01133 _mm_set1_epi8(char b) 01134 { 01135 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 01136 } 01137 01138 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01139 _mm_setr_epi64(__m64 q0, __m64 q1) 01140 { 01141 return (__m128i){ (long long)q0, (long long)q1 }; 01142 } 01143 01144 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01145 _mm_setr_epi32(int i0, int i1, int i2, int i3) 01146 { 01147 return (__m128i)(__v4si){ i0, i1, i2, i3}; 01148 } 01149 01150 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01151 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 01152 { 01153 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 01154 } 01155 01156 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01157 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 01158 { 01159 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 01160 } 01161 01162 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01163 _mm_setzero_si128(void) 01164 { 01165 return (__m128i){ 0LL, 0LL }; 01166 } 01167 01168 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01169 _mm_store_si128(__m128i *p, __m128i b) 01170 { 01171 *p = b; 01172 } 01173 01174 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01175 _mm_storeu_si128(__m128i *p, __m128i b) 01176 { 01177 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 01178 } 01179 01180 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01181 _mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 01182 { 01183 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 01184 } 01185 01186 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01187 _mm_storel_epi64(__m128i *p, __m128i a) 01188 { 01189 struct __mm_storel_epi64_struct { 01190 long long u; 01191 } __attribute__((__packed__, __may_alias__)); 01192 ((struct __mm_storel_epi64_struct*)p)->u = a[0]; 01193 } 01194 01195 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01196 _mm_stream_pd(double *p, __m128d a) 01197 { 01198 __builtin_ia32_movntpd(p, a); 01199 } 01200 01201 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01202 _mm_stream_si128(__m128i *p, __m128i a) 01203 { 01204 __builtin_ia32_movntdq(p, a); 01205 } 01206 01207 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01208 _mm_stream_si32(int *p, int a) 01209 { 01210 __builtin_ia32_movnti(p, a); 01211 } 01212 01213 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01214 _mm_clflush(void const *p) 01215 { 01216 __builtin_ia32_clflush(p); 01217 } 01218 01219 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01220 _mm_lfence(void) 01221 { 01222 __builtin_ia32_lfence(); 01223 } 01224 01225 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01226 _mm_mfence(void) 01227 { 01228 __builtin_ia32_mfence(); 01229 } 01230 01231 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01232 _mm_packs_epi16(__m128i a, __m128i b) 01233 { 01234 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 01235 } 01236 01237 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01238 _mm_packs_epi32(__m128i a, __m128i b) 01239 { 01240 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 01241 } 01242 01243 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01244 _mm_packus_epi16(__m128i a, __m128i b) 01245 { 01246 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 01247 } 01248 01249 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01250 _mm_extract_epi16(__m128i a, int imm) 01251 { 01252 __v8hi b = (__v8hi)a; 01253 return (unsigned short)b[imm]; 01254 } 01255 01256 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01257 _mm_insert_epi16(__m128i a, int b, int imm) 01258 { 01259 __v8hi c = (__v8hi)a; 01260 c[imm & 7] = b; 01261 return (__m128i)c; 01262 } 01263 01264 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01265 _mm_movemask_epi8(__m128i a) 01266 { 01267 return __builtin_ia32_pmovmskb128((__v16qi)a); 01268 } 01269 01270 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 01271 __m128i __a = (a); \ 01272 (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \ 01273 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 01274 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 01275 01276 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 01277 __m128i __a = (a); \ 01278 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 01279 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 01280 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 01281 4, 5, 6, 7); }) 01282 01283 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 01284 __m128i __a = (a); \ 01285 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 01286 0, 1, 2, 3, \ 01287 4 + (((imm) & 0x03) >> 0), \ 01288 4 + (((imm) & 0x0c) >> 2), \ 01289 4 + (((imm) & 0x30) >> 4), \ 01290 4 + (((imm) & 0xc0) >> 6)); }) 01291 01292 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01293 _mm_unpackhi_epi8(__m128i a, __m128i b) 01294 { 01295 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 01296 } 01297 01298 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01299 _mm_unpackhi_epi16(__m128i a, __m128i b) 01300 { 01301 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 01302 } 01303 01304 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01305 _mm_unpackhi_epi32(__m128i a, __m128i b) 01306 { 01307 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 01308 } 01309 01310 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01311 _mm_unpackhi_epi64(__m128i a, __m128i b) 01312 { 01313 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 01314 } 01315 01316 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01317 _mm_unpacklo_epi8(__m128i a, __m128i b) 01318 { 01319 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 01320 } 01321 01322 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01323 _mm_unpacklo_epi16(__m128i a, __m128i b) 01324 { 01325 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 01326 } 01327 01328 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01329 _mm_unpacklo_epi32(__m128i a, __m128i b) 01330 { 01331 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 01332 } 01333 01334 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01335 _mm_unpacklo_epi64(__m128i a, __m128i b) 01336 { 01337 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 01338 } 01339 01340 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 01341 _mm_movepi64_pi64(__m128i a) 01342 { 01343 return (__m64)a[0]; 01344 } 01345 01346 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01347 _mm_movpi64_pi64(__m64 a) 01348 { 01349 return (__m128i){ (long long)a, 0 }; 01350 } 01351 01352 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01353 _mm_move_epi64(__m128i a) 01354 { 01355 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); 01356 } 01357 01358 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01359 _mm_unpackhi_pd(__m128d a, __m128d b) 01360 { 01361 return __builtin_shufflevector(a, b, 1, 2+1); 01362 } 01363 01364 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01365 _mm_unpacklo_pd(__m128d a, __m128d b) 01366 { 01367 return __builtin_shufflevector(a, b, 0, 2+0); 01368 } 01369 01370 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 01371 _mm_movemask_pd(__m128d a) 01372 { 01373 return __builtin_ia32_movmskpd(a); 01374 } 01375 01376 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 01377 __m128d __a = (a); \ 01378 __m128d __b = (b); \ 01379 __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); }) 01380 01381 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 01382 _mm_castpd_ps(__m128d in) 01383 { 01384 return (__m128)in; 01385 } 01386 01387 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01388 _mm_castpd_si128(__m128d in) 01389 { 01390 return (__m128i)in; 01391 } 01392 01393 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01394 _mm_castps_pd(__m128 in) 01395 { 01396 return (__m128d)in; 01397 } 01398 01399 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 01400 _mm_castps_si128(__m128 in) 01401 { 01402 return (__m128i)in; 01403 } 01404 01405 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 01406 _mm_castsi128_ps(__m128i in) 01407 { 01408 return (__m128)in; 01409 } 01410 01411 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 01412 _mm_castsi128_pd(__m128i in) 01413 { 01414 return (__m128d)in; 01415 } 01416 01417 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 01418 _mm_pause(void) 01419 { 01420 __asm__ volatile ("pause"); 01421 } 01422 01423 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 01424 01425 #endif /* __SSE2__ */ 01426 01427 #endif /* __EMMINTRIN_H */