clang API Documentation
00001 /*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 00002 * 00003 * Permission is hereby granted, free of charge, to any person obtaining a copy 00004 * of this software and associated documentation files (the "Software"), to deal 00005 * in the Software without restriction, including without limitation the rights 00006 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00007 * copies of the Software, and to permit persons to whom the Software is 00008 * furnished to do so, subject to the following conditions: 00009 * 00010 * The above copyright notice and this permission notice shall be included in 00011 * all copies or substantial portions of the Software. 00012 * 00013 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00014 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00015 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00016 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00017 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00018 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00019 * THE SOFTWARE. 00020 * 00021 *===-----------------------------------------------------------------------=== 00022 */ 00023 00024 #ifndef __IMMINTRIN_H 00025 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 00026 #endif 00027 00028 typedef double __v4df __attribute__ ((__vector_size__ (32))); 00029 typedef float __v8sf __attribute__ ((__vector_size__ (32))); 00030 typedef long long __v4di __attribute__ ((__vector_size__ (32))); 00031 typedef int __v8si __attribute__ ((__vector_size__ (32))); 00032 typedef short __v16hi __attribute__ ((__vector_size__ (32))); 00033 typedef char __v32qi __attribute__ ((__vector_size__ (32))); 00034 00035 typedef float __m256 __attribute__ ((__vector_size__ (32))); 00036 typedef double __m256d __attribute__((__vector_size__(32))); 00037 typedef long long __m256i __attribute__((__vector_size__(32))); 00038 00039 /* Arithmetic */ 00040 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00041 _mm256_add_pd(__m256d a, __m256d b) 00042 { 00043 return a+b; 00044 } 00045 00046 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00047 _mm256_add_ps(__m256 a, __m256 b) 00048 { 00049 return a+b; 00050 } 00051 00052 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00053 _mm256_sub_pd(__m256d a, __m256d b) 00054 { 00055 return a-b; 00056 } 00057 00058 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00059 _mm256_sub_ps(__m256 a, __m256 b) 00060 { 00061 return a-b; 00062 } 00063 00064 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00065 _mm256_addsub_pd(__m256d a, __m256d b) 00066 { 00067 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b); 00068 } 00069 00070 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00071 _mm256_addsub_ps(__m256 a, __m256 b) 00072 { 00073 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b); 00074 } 00075 00076 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00077 _mm256_div_pd(__m256d a, __m256d b) 00078 { 00079 return a / b; 00080 } 00081 00082 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00083 _mm256_div_ps(__m256 a, __m256 b) 00084 { 00085 return a / b; 00086 } 00087 00088 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00089 _mm256_max_pd(__m256d a, __m256d b) 00090 { 00091 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b); 00092 } 00093 00094 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00095 _mm256_max_ps(__m256 a, __m256 b) 00096 { 00097 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b); 00098 } 00099 00100 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00101 _mm256_min_pd(__m256d a, __m256d b) 00102 { 00103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b); 00104 } 00105 00106 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00107 _mm256_min_ps(__m256 a, __m256 b) 00108 { 00109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b); 00110 } 00111 00112 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00113 _mm256_mul_pd(__m256d a, __m256d b) 00114 { 00115 return a * b; 00116 } 00117 00118 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00119 _mm256_mul_ps(__m256 a, __m256 b) 00120 { 00121 return a * b; 00122 } 00123 00124 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00125 _mm256_sqrt_pd(__m256d a) 00126 { 00127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a); 00128 } 00129 00130 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00131 _mm256_sqrt_ps(__m256 a) 00132 { 00133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a); 00134 } 00135 00136 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00137 _mm256_rsqrt_ps(__m256 a) 00138 { 00139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a); 00140 } 00141 00142 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00143 _mm256_rcp_ps(__m256 a) 00144 { 00145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a); 00146 } 00147 00148 #define _mm256_round_pd(V, M) __extension__ ({ \ 00149 __m256d __V = (V); \ 00150 (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); }) 00151 00152 #define _mm256_round_ps(V, M) __extension__ ({ \ 00153 __m256 __V = (V); \ 00154 (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); }) 00155 00156 #define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) 00157 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) 00158 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 00159 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 00160 00161 /* Logical */ 00162 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00163 _mm256_and_pd(__m256d a, __m256d b) 00164 { 00165 return (__m256d)((__v4di)a & (__v4di)b); 00166 } 00167 00168 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00169 _mm256_and_ps(__m256 a, __m256 b) 00170 { 00171 return (__m256)((__v8si)a & (__v8si)b); 00172 } 00173 00174 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00175 _mm256_andnot_pd(__m256d a, __m256d b) 00176 { 00177 return (__m256d)(~(__v4di)a & (__v4di)b); 00178 } 00179 00180 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00181 _mm256_andnot_ps(__m256 a, __m256 b) 00182 { 00183 return (__m256)(~(__v8si)a & (__v8si)b); 00184 } 00185 00186 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00187 _mm256_or_pd(__m256d a, __m256d b) 00188 { 00189 return (__m256d)((__v4di)a | (__v4di)b); 00190 } 00191 00192 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00193 _mm256_or_ps(__m256 a, __m256 b) 00194 { 00195 return (__m256)((__v8si)a | (__v8si)b); 00196 } 00197 00198 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00199 _mm256_xor_pd(__m256d a, __m256d b) 00200 { 00201 return (__m256d)((__v4di)a ^ (__v4di)b); 00202 } 00203 00204 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00205 _mm256_xor_ps(__m256 a, __m256 b) 00206 { 00207 return (__m256)((__v8si)a ^ (__v8si)b); 00208 } 00209 00210 /* Horizontal arithmetic */ 00211 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00212 _mm256_hadd_pd(__m256d a, __m256d b) 00213 { 00214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b); 00215 } 00216 00217 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00218 _mm256_hadd_ps(__m256 a, __m256 b) 00219 { 00220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b); 00221 } 00222 00223 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00224 _mm256_hsub_pd(__m256d a, __m256d b) 00225 { 00226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b); 00227 } 00228 00229 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00230 _mm256_hsub_ps(__m256 a, __m256 b) 00231 { 00232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b); 00233 } 00234 00235 /* Vector permutations */ 00236 static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 00237 _mm_permutevar_pd(__m128d a, __m128i c) 00238 { 00239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c); 00240 } 00241 00242 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00243 _mm256_permutevar_pd(__m256d a, __m256i c) 00244 { 00245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c); 00246 } 00247 00248 static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 00249 _mm_permutevar_ps(__m128 a, __m128i c) 00250 { 00251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c); 00252 } 00253 00254 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00255 _mm256_permutevar_ps(__m256 a, __m256i c) 00256 { 00257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, 00258 (__v8si)c); 00259 } 00260 00261 #define _mm_permute_pd(A, C) __extension__ ({ \ 00262 __m128d __A = (A); \ 00263 (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \ 00264 (C) & 0x1, ((C) & 0x2) >> 1); }) 00265 00266 #define _mm256_permute_pd(A, C) __extension__ ({ \ 00267 __m256d __A = (A); \ 00268 (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \ 00269 (C) & 0x1, ((C) & 0x2) >> 1, \ 00270 2 + (((C) & 0x4) >> 2), \ 00271 2 + (((C) & 0x8) >> 3)); }) 00272 00273 #define _mm_permute_ps(A, C) __extension__ ({ \ 00274 __m128 __A = (A); \ 00275 (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \ 00276 (C) & 0x3, ((C) & 0xc) >> 2, \ 00277 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) 00278 00279 #define _mm256_permute_ps(A, C) __extension__ ({ \ 00280 __m256 __A = (A); \ 00281 (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \ 00282 (C) & 0x3, ((C) & 0xc) >> 2, \ 00283 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \ 00284 4 + (((C) & 0x03) >> 0), \ 00285 4 + (((C) & 0x0c) >> 2), \ 00286 4 + (((C) & 0x30) >> 4), \ 00287 4 + (((C) & 0xc0) >> 6)); }) 00288 00289 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 00290 __m256d __V1 = (V1); \ 00291 __m256d __V2 = (V2); \ 00292 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); }) 00293 00294 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 00295 __m256 __V1 = (V1); \ 00296 __m256 __V2 = (V2); \ 00297 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 00298 00299 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 00300 __m256i __V1 = (V1); \ 00301 __m256i __V2 = (V2); \ 00302 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); }) 00303 00304 /* Vector Blend */ 00305 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 00306 __m256d __V1 = (V1); \ 00307 __m256d __V2 = (V2); \ 00308 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); }) 00309 00310 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 00311 __m256 __V1 = (V1); \ 00312 __m256 __V2 = (V2); \ 00313 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 00314 00315 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00316 _mm256_blendv_pd(__m256d a, __m256d b, __m256d c) 00317 { 00318 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c); 00319 } 00320 00321 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00322 _mm256_blendv_ps(__m256 a, __m256 b, __m256 c) 00323 { 00324 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c); 00325 } 00326 00327 /* Vector Dot Product */ 00328 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 00329 __m256 __V1 = (V1); \ 00330 __m256 __V2 = (V2); \ 00331 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 00332 00333 /* Vector shuffle */ 00334 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ 00335 __m256 __a = (a); \ 00336 __m256 __b = (b); \ 00337 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \ 00338 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 00339 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \ 00340 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \ 00341 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); }) 00342 00343 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ 00344 __m256d __a = (a); \ 00345 __m256d __b = (b); \ 00346 (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \ 00347 (mask) & 0x1, \ 00348 (((mask) & 0x2) >> 1) + 4, \ 00349 (((mask) & 0x4) >> 2) + 2, \ 00350 (((mask) & 0x8) >> 3) + 6); }) 00351 00352 /* Compare */ 00353 #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ 00354 #define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ 00355 #define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ 00356 #define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ 00357 #define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ 00358 #define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ 00359 #define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ 00360 #define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ 00361 #define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ 00362 #define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ 00363 #define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ 00364 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ 00365 #define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ 00366 #define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ 00367 #define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ 00368 #define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ 00369 #define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ 00370 #define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ 00371 #define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ 00372 #define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ 00373 #define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ 00374 #define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ 00375 #define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ 00376 #define _CMP_ORD_S 0x17 /* Ordered (signaling) */ 00377 #define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ 00378 #define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ 00379 #define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ 00380 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ 00381 #define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ 00382 #define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ 00383 #define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ 00384 #define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ 00385 00386 #define _mm_cmp_pd(a, b, c) __extension__ ({ \ 00387 __m128d __a = (a); \ 00388 __m128d __b = (b); \ 00389 (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); }) 00390 00391 #define _mm_cmp_ps(a, b, c) __extension__ ({ \ 00392 __m128 __a = (a); \ 00393 __m128 __b = (b); \ 00394 (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); }) 00395 00396 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \ 00397 __m256d __a = (a); \ 00398 __m256d __b = (b); \ 00399 (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); }) 00400 00401 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \ 00402 __m256 __a = (a); \ 00403 __m256 __b = (b); \ 00404 (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); }) 00405 00406 #define _mm_cmp_sd(a, b, c) __extension__ ({ \ 00407 __m128d __a = (a); \ 00408 __m128d __b = (b); \ 00409 (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); }) 00410 00411 #define _mm_cmp_ss(a, b, c) __extension__ ({ \ 00412 __m128 __a = (a); \ 00413 __m128 __b = (b); \ 00414 (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); }) 00415 00416 /* Vector extract */ 00417 #define _mm256_extractf128_pd(A, O) __extension__ ({ \ 00418 __m256d __A = (A); \ 00419 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); }) 00420 00421 #define _mm256_extractf128_ps(A, O) __extension__ ({ \ 00422 __m256 __A = (A); \ 00423 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); }) 00424 00425 #define _mm256_extractf128_si256(A, O) __extension__ ({ \ 00426 __m256i __A = (A); \ 00427 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); }) 00428 00429 static __inline int __attribute__((__always_inline__, __nodebug__)) 00430 _mm256_extract_epi32(__m256i a, int const imm) 00431 { 00432 __v8si b = (__v8si)a; 00433 return b[imm]; 00434 } 00435 00436 static __inline int __attribute__((__always_inline__, __nodebug__)) 00437 _mm256_extract_epi16(__m256i a, int const imm) 00438 { 00439 __v16hi b = (__v16hi)a; 00440 return b[imm]; 00441 } 00442 00443 static __inline int __attribute__((__always_inline__, __nodebug__)) 00444 _mm256_extract_epi8(__m256i a, int const imm) 00445 { 00446 __v32qi b = (__v32qi)a; 00447 return b[imm]; 00448 } 00449 00450 #ifdef __x86_64__ 00451 static __inline long long __attribute__((__always_inline__, __nodebug__)) 00452 _mm256_extract_epi64(__m256i a, const int imm) 00453 { 00454 __v4di b = (__v4di)a; 00455 return b[imm]; 00456 } 00457 #endif 00458 00459 /* Vector insert */ 00460 #define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \ 00461 __m256d __V1 = (V1); \ 00462 __m128d __V2 = (V2); \ 00463 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); }) 00464 00465 #define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \ 00466 __m256 __V1 = (V1); \ 00467 __m128 __V2 = (V2); \ 00468 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); }) 00469 00470 #define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \ 00471 __m256i __V1 = (V1); \ 00472 __m128i __V2 = (V2); \ 00473 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); }) 00474 00475 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00476 _mm256_insert_epi32(__m256i a, int b, int const imm) 00477 { 00478 __v8si c = (__v8si)a; 00479 c[imm & 7] = b; 00480 return (__m256i)c; 00481 } 00482 00483 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00484 _mm256_insert_epi16(__m256i a, int b, int const imm) 00485 { 00486 __v16hi c = (__v16hi)a; 00487 c[imm & 15] = b; 00488 return (__m256i)c; 00489 } 00490 00491 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00492 _mm256_insert_epi8(__m256i a, int b, int const imm) 00493 { 00494 __v32qi c = (__v32qi)a; 00495 c[imm & 31] = b; 00496 return (__m256i)c; 00497 } 00498 00499 #ifdef __x86_64__ 00500 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00501 _mm256_insert_epi64(__m256i a, int b, int const imm) 00502 { 00503 __v4di c = (__v4di)a; 00504 c[imm & 3] = b; 00505 return (__m256i)c; 00506 } 00507 #endif 00508 00509 /* Conversion */ 00510 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00511 _mm256_cvtepi32_pd(__m128i a) 00512 { 00513 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a); 00514 } 00515 00516 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00517 _mm256_cvtepi32_ps(__m256i a) 00518 { 00519 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a); 00520 } 00521 00522 static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 00523 _mm256_cvtpd_ps(__m256d a) 00524 { 00525 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a); 00526 } 00527 00528 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00529 _mm256_cvtps_epi32(__m256 a) 00530 { 00531 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a); 00532 } 00533 00534 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00535 _mm256_cvtps_pd(__m128 a) 00536 { 00537 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a); 00538 } 00539 00540 static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 00541 _mm256_cvttpd_epi32(__m256d a) 00542 { 00543 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a); 00544 } 00545 00546 static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 00547 _mm256_cvtpd_epi32(__m256d a) 00548 { 00549 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a); 00550 } 00551 00552 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00553 _mm256_cvttps_epi32(__m256 a) 00554 { 00555 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a); 00556 } 00557 00558 /* Vector replicate */ 00559 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00560 _mm256_movehdup_ps(__m256 a) 00561 { 00562 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7); 00563 } 00564 00565 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00566 _mm256_moveldup_ps(__m256 a) 00567 { 00568 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6); 00569 } 00570 00571 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00572 _mm256_movedup_pd(__m256d a) 00573 { 00574 return __builtin_shufflevector(a, a, 0, 0, 2, 2); 00575 } 00576 00577 /* Unpack and Interleave */ 00578 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00579 _mm256_unpackhi_pd(__m256d a, __m256d b) 00580 { 00581 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2); 00582 } 00583 00584 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00585 _mm256_unpacklo_pd(__m256d a, __m256d b) 00586 { 00587 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2); 00588 } 00589 00590 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00591 _mm256_unpackhi_ps(__m256 a, __m256 b) 00592 { 00593 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 00594 } 00595 00596 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00597 _mm256_unpacklo_ps(__m256 a, __m256 b) 00598 { 00599 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 00600 } 00601 00602 /* Bit Test */ 00603 static __inline int __attribute__((__always_inline__, __nodebug__)) 00604 _mm_testz_pd(__m128d a, __m128d b) 00605 { 00606 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b); 00607 } 00608 00609 static __inline int __attribute__((__always_inline__, __nodebug__)) 00610 _mm_testc_pd(__m128d a, __m128d b) 00611 { 00612 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b); 00613 } 00614 00615 static __inline int __attribute__((__always_inline__, __nodebug__)) 00616 _mm_testnzc_pd(__m128d a, __m128d b) 00617 { 00618 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b); 00619 } 00620 00621 static __inline int __attribute__((__always_inline__, __nodebug__)) 00622 _mm_testz_ps(__m128 a, __m128 b) 00623 { 00624 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b); 00625 } 00626 00627 static __inline int __attribute__((__always_inline__, __nodebug__)) 00628 _mm_testc_ps(__m128 a, __m128 b) 00629 { 00630 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b); 00631 } 00632 00633 static __inline int __attribute__((__always_inline__, __nodebug__)) 00634 _mm_testnzc_ps(__m128 a, __m128 b) 00635 { 00636 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b); 00637 } 00638 00639 static __inline int __attribute__((__always_inline__, __nodebug__)) 00640 _mm256_testz_pd(__m256d a, __m256d b) 00641 { 00642 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b); 00643 } 00644 00645 static __inline int __attribute__((__always_inline__, __nodebug__)) 00646 _mm256_testc_pd(__m256d a, __m256d b) 00647 { 00648 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b); 00649 } 00650 00651 static __inline int __attribute__((__always_inline__, __nodebug__)) 00652 _mm256_testnzc_pd(__m256d a, __m256d b) 00653 { 00654 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b); 00655 } 00656 00657 static __inline int __attribute__((__always_inline__, __nodebug__)) 00658 _mm256_testz_ps(__m256 a, __m256 b) 00659 { 00660 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b); 00661 } 00662 00663 static __inline int __attribute__((__always_inline__, __nodebug__)) 00664 _mm256_testc_ps(__m256 a, __m256 b) 00665 { 00666 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b); 00667 } 00668 00669 static __inline int __attribute__((__always_inline__, __nodebug__)) 00670 _mm256_testnzc_ps(__m256 a, __m256 b) 00671 { 00672 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b); 00673 } 00674 00675 static __inline int __attribute__((__always_inline__, __nodebug__)) 00676 _mm256_testz_si256(__m256i a, __m256i b) 00677 { 00678 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b); 00679 } 00680 00681 static __inline int __attribute__((__always_inline__, __nodebug__)) 00682 _mm256_testc_si256(__m256i a, __m256i b) 00683 { 00684 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b); 00685 } 00686 00687 static __inline int __attribute__((__always_inline__, __nodebug__)) 00688 _mm256_testnzc_si256(__m256i a, __m256i b) 00689 { 00690 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b); 00691 } 00692 00693 /* Vector extract sign mask */ 00694 static __inline int __attribute__((__always_inline__, __nodebug__)) 00695 _mm256_movemask_pd(__m256d a) 00696 { 00697 return __builtin_ia32_movmskpd256((__v4df)a); 00698 } 00699 00700 static __inline int __attribute__((__always_inline__, __nodebug__)) 00701 _mm256_movemask_ps(__m256 a) 00702 { 00703 return __builtin_ia32_movmskps256((__v8sf)a); 00704 } 00705 00706 /* Vector zero */ 00707 static __inline void __attribute__((__always_inline__, __nodebug__)) 00708 _mm256_zeroall(void) 00709 { 00710 __builtin_ia32_vzeroall(); 00711 } 00712 00713 static __inline void __attribute__((__always_inline__, __nodebug__)) 00714 _mm256_zeroupper(void) 00715 { 00716 __builtin_ia32_vzeroupper(); 00717 } 00718 00719 /* Vector load with broadcast */ 00720 static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 00721 _mm_broadcast_ss(float const *a) 00722 { 00723 return (__m128)__builtin_ia32_vbroadcastss(a); 00724 } 00725 00726 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00727 _mm256_broadcast_sd(double const *a) 00728 { 00729 return (__m256d)__builtin_ia32_vbroadcastsd256(a); 00730 } 00731 00732 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00733 _mm256_broadcast_ss(float const *a) 00734 { 00735 return (__m256)__builtin_ia32_vbroadcastss256(a); 00736 } 00737 00738 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00739 _mm256_broadcast_pd(__m128d const *a) 00740 { 00741 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a); 00742 } 00743 00744 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00745 _mm256_broadcast_ps(__m128 const *a) 00746 { 00747 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a); 00748 } 00749 00750 /* SIMD load ops */ 00751 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00752 _mm256_load_pd(double const *p) 00753 { 00754 return *(__m256d *)p; 00755 } 00756 00757 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00758 _mm256_load_ps(float const *p) 00759 { 00760 return *(__m256 *)p; 00761 } 00762 00763 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00764 _mm256_loadu_pd(double const *p) 00765 { 00766 struct __loadu_pd { 00767 __m256d v; 00768 } __attribute__((packed, may_alias)); 00769 return ((struct __loadu_pd*)p)->v; 00770 } 00771 00772 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00773 _mm256_loadu_ps(float const *p) 00774 { 00775 struct __loadu_ps { 00776 __m256 v; 00777 } __attribute__((packed, may_alias)); 00778 return ((struct __loadu_ps*)p)->v; 00779 } 00780 00781 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00782 _mm256_load_si256(__m256i const *p) 00783 { 00784 return *p; 00785 } 00786 00787 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00788 _mm256_loadu_si256(__m256i const *p) 00789 { 00790 struct __loadu_si256 { 00791 __m256i v; 00792 } __attribute__((packed, may_alias)); 00793 return ((struct __loadu_si256*)p)->v; 00794 } 00795 00796 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00797 _mm256_lddqu_si256(__m256i const *p) 00798 { 00799 return (__m256i)__builtin_ia32_lddqu256((char const *)p); 00800 } 00801 00802 /* SIMD store ops */ 00803 static __inline void __attribute__((__always_inline__, __nodebug__)) 00804 _mm256_store_pd(double *p, __m256d a) 00805 { 00806 *(__m256d *)p = a; 00807 } 00808 00809 static __inline void __attribute__((__always_inline__, __nodebug__)) 00810 _mm256_store_ps(float *p, __m256 a) 00811 { 00812 *(__m256 *)p = a; 00813 } 00814 00815 static __inline void __attribute__((__always_inline__, __nodebug__)) 00816 _mm256_storeu_pd(double *p, __m256d a) 00817 { 00818 __builtin_ia32_storeupd256(p, (__v4df)a); 00819 } 00820 00821 static __inline void __attribute__((__always_inline__, __nodebug__)) 00822 _mm256_storeu_ps(float *p, __m256 a) 00823 { 00824 __builtin_ia32_storeups256(p, (__v8sf)a); 00825 } 00826 00827 static __inline void __attribute__((__always_inline__, __nodebug__)) 00828 _mm256_store_si256(__m256i *p, __m256i a) 00829 { 00830 *p = a; 00831 } 00832 00833 static __inline void __attribute__((__always_inline__, __nodebug__)) 00834 _mm256_storeu_si256(__m256i *p, __m256i a) 00835 { 00836 __builtin_ia32_storedqu256((char *)p, (__v32qi)a); 00837 } 00838 00839 /* Conditional load ops */ 00840 static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 00841 _mm_maskload_pd(double const *p, __m128d m) 00842 { 00843 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m); 00844 } 00845 00846 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00847 _mm256_maskload_pd(double const *p, __m256d m) 00848 { 00849 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m); 00850 } 00851 00852 static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 00853 _mm_maskload_ps(float const *p, __m128 m) 00854 { 00855 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m); 00856 } 00857 00858 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00859 _mm256_maskload_ps(float const *p, __m256 m) 00860 { 00861 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m); 00862 } 00863 00864 /* Conditional store ops */ 00865 static __inline void __attribute__((__always_inline__, __nodebug__)) 00866 _mm256_maskstore_ps(float *p, __m256 m, __m256 a) 00867 { 00868 __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a); 00869 } 00870 00871 static __inline void __attribute__((__always_inline__, __nodebug__)) 00872 _mm_maskstore_pd(double *p, __m128d m, __m128d a) 00873 { 00874 __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a); 00875 } 00876 00877 static __inline void __attribute__((__always_inline__, __nodebug__)) 00878 _mm256_maskstore_pd(double *p, __m256d m, __m256d a) 00879 { 00880 __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a); 00881 } 00882 00883 static __inline void __attribute__((__always_inline__, __nodebug__)) 00884 _mm_maskstore_ps(float *p, __m128 m, __m128 a) 00885 { 00886 __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a); 00887 } 00888 00889 /* Cacheability support ops */ 00890 static __inline void __attribute__((__always_inline__, __nodebug__)) 00891 _mm256_stream_si256(__m256i *a, __m256i b) 00892 { 00893 __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b); 00894 } 00895 00896 static __inline void __attribute__((__always_inline__, __nodebug__)) 00897 _mm256_stream_pd(double *a, __m256d b) 00898 { 00899 __builtin_ia32_movntpd256(a, (__v4df)b); 00900 } 00901 00902 static __inline void __attribute__((__always_inline__, __nodebug__)) 00903 _mm256_stream_ps(float *p, __m256 a) 00904 { 00905 __builtin_ia32_movntps256(p, (__v8sf)a); 00906 } 00907 00908 /* Create vectors */ 00909 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00910 _mm256_set_pd(double a, double b, double c, double d) 00911 { 00912 return (__m256d){ d, c, b, a }; 00913 } 00914 00915 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00916 _mm256_set_ps(float a, float b, float c, float d, 00917 float e, float f, float g, float h) 00918 { 00919 return (__m256){ h, g, f, e, d, c, b, a }; 00920 } 00921 00922 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00923 _mm256_set_epi32(int i0, int i1, int i2, int i3, 00924 int i4, int i5, int i6, int i7) 00925 { 00926 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 }; 00927 } 00928 00929 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00930 _mm256_set_epi16(short w15, short w14, short w13, short w12, 00931 short w11, short w10, short w09, short w08, 00932 short w07, short w06, short w05, short w04, 00933 short w03, short w02, short w01, short w00) 00934 { 00935 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07, 00936 w08, w09, w10, w11, w12, w13, w14, w15 }; 00937 } 00938 00939 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00940 _mm256_set_epi8(char b31, char b30, char b29, char b28, 00941 char b27, char b26, char b25, char b24, 00942 char b23, char b22, char b21, char b20, 00943 char b19, char b18, char b17, char b16, 00944 char b15, char b14, char b13, char b12, 00945 char b11, char b10, char b09, char b08, 00946 char b07, char b06, char b05, char b04, 00947 char b03, char b02, char b01, char b00) 00948 { 00949 return (__m256i)(__v32qi){ 00950 b00, b01, b02, b03, b04, b05, b06, b07, 00951 b08, b09, b10, b11, b12, b13, b14, b15, 00952 b16, b17, b18, b19, b20, b21, b22, b23, 00953 b24, b25, b26, b27, b28, b29, b30, b31 00954 }; 00955 } 00956 00957 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00958 _mm256_set_epi64x(long long a, long long b, long long c, long long d) 00959 { 00960 return (__m256i)(__v4di){ d, c, b, a }; 00961 } 00962 00963 /* Create vectors with elements in reverse order */ 00964 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 00965 _mm256_setr_pd(double a, double b, double c, double d) 00966 { 00967 return (__m256d){ a, b, c, d }; 00968 } 00969 00970 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 00971 _mm256_setr_ps(float a, float b, float c, float d, 00972 float e, float f, float g, float h) 00973 { 00974 return (__m256){ a, b, c, d, e, f, g, h }; 00975 } 00976 00977 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00978 _mm256_setr_epi32(int i0, int i1, int i2, int i3, 00979 int i4, int i5, int i6, int i7) 00980 { 00981 return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 }; 00982 } 00983 00984 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00985 _mm256_setr_epi16(short w15, short w14, short w13, short w12, 00986 short w11, short w10, short w09, short w08, 00987 short w07, short w06, short w05, short w04, 00988 short w03, short w02, short w01, short w00) 00989 { 00990 return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08, 00991 w07, w06, w05, w04, w03, w02, w01, w00 }; 00992 } 00993 00994 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 00995 _mm256_setr_epi8(char b31, char b30, char b29, char b28, 00996 char b27, char b26, char b25, char b24, 00997 char b23, char b22, char b21, char b20, 00998 char b19, char b18, char b17, char b16, 00999 char b15, char b14, char b13, char b12, 01000 char b11, char b10, char b09, char b08, 01001 char b07, char b06, char b05, char b04, 01002 char b03, char b02, char b01, char b00) 01003 { 01004 return (__m256i)(__v32qi){ 01005 b31, b30, b29, b28, b27, b26, b25, b24, 01006 b23, b22, b21, b20, b19, b18, b17, b16, 01007 b15, b14, b13, b12, b11, b10, b09, b08, 01008 b07, b06, b05, b04, b03, b02, b01, b00 }; 01009 } 01010 01011 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01012 _mm256_setr_epi64x(long long a, long long b, long long c, long long d) 01013 { 01014 return (__m256i)(__v4di){ a, b, c, d }; 01015 } 01016 01017 /* Create vectors with repeated elements */ 01018 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 01019 _mm256_set1_pd(double w) 01020 { 01021 return (__m256d){ w, w, w, w }; 01022 } 01023 01024 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 01025 _mm256_set1_ps(float w) 01026 { 01027 return (__m256){ w, w, w, w, w, w, w, w }; 01028 } 01029 01030 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01031 _mm256_set1_epi32(int i) 01032 { 01033 return (__m256i)(__v8si){ i, i, i, i, i, i, i, i }; 01034 } 01035 01036 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01037 _mm256_set1_epi16(short w) 01038 { 01039 return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w }; 01040 } 01041 01042 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01043 _mm256_set1_epi8(char b) 01044 { 01045 return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, 01046 b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 01047 } 01048 01049 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01050 _mm256_set1_epi64x(long long q) 01051 { 01052 return (__m256i)(__v4di){ q, q, q, q }; 01053 } 01054 01055 /* Create zeroed vectors */ 01056 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 01057 _mm256_setzero_pd(void) 01058 { 01059 return (__m256d){ 0, 0, 0, 0 }; 01060 } 01061 01062 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 01063 _mm256_setzero_ps(void) 01064 { 01065 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 01066 } 01067 01068 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01069 _mm256_setzero_si256(void) 01070 { 01071 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 01072 } 01073 01074 /* Cast between vector types */ 01075 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 01076 _mm256_castpd_ps(__m256d in) 01077 { 01078 return (__m256)in; 01079 } 01080 01081 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01082 _mm256_castpd_si256(__m256d in) 01083 { 01084 return (__m256i)in; 01085 } 01086 01087 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 01088 _mm256_castps_pd(__m256 in) 01089 { 01090 return (__m256d)in; 01091 } 01092 01093 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01094 _mm256_castps_si256(__m256 in) 01095 { 01096 return (__m256i)in; 01097 } 01098 01099 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 01100 _mm256_castsi256_ps(__m256i in) 01101 { 01102 return (__m256)in; 01103 } 01104 01105 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 01106 _mm256_castsi256_pd(__m256i in) 01107 { 01108 return (__m256d)in; 01109 } 01110 01111 static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 01112 _mm256_castpd256_pd128(__m256d in) 01113 { 01114 return __builtin_shufflevector(in, in, 0, 1); 01115 } 01116 01117 static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 01118 _mm256_castps256_ps128(__m256 in) 01119 { 01120 return __builtin_shufflevector(in, in, 0, 1, 2, 3); 01121 } 01122 01123 static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 01124 _mm256_castsi256_si128(__m256i in) 01125 { 01126 return __builtin_shufflevector(in, in, 0, 1); 01127 } 01128 01129 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 01130 _mm256_castpd128_pd256(__m128d in) 01131 { 01132 __m128d zero = _mm_setzero_pd(); 01133 return __builtin_shufflevector(in, zero, 0, 1, 2, 2); 01134 } 01135 01136 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 01137 _mm256_castps128_ps256(__m128 in) 01138 { 01139 __m128 zero = _mm_setzero_ps(); 01140 return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4); 01141 } 01142 01143 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01144 _mm256_castsi128_si256(__m128i in) 01145 { 01146 __m128i zero = _mm_setzero_si128(); 01147 return __builtin_shufflevector(in, zero, 0, 1, 2, 2); 01148 } 01149 01150 /* SIMD load ops (unaligned) */ 01151 static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 01152 _mm256_loadu2_m128(float const *addr_hi, float const *addr_lo) 01153 { 01154 struct __loadu_ps { 01155 __m128 v; 01156 } __attribute__((__packed__, __may_alias__)); 01157 01158 __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v); 01159 return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1); 01160 } 01161 01162 static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 01163 _mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo) 01164 { 01165 struct __loadu_pd { 01166 __m128d v; 01167 } __attribute__((__packed__, __may_alias__)); 01168 01169 __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v); 01170 return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1); 01171 } 01172 01173 static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 01174 _mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo) 01175 { 01176 struct __loadu_si128 { 01177 __m128i v; 01178 } __attribute__((packed, may_alias)); 01179 __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v); 01180 return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1); 01181 } 01182 01183 /* SIMD store ops (unaligned) */ 01184 static __inline void __attribute__((__always_inline__, __nodebug__)) 01185 _mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a) 01186 { 01187 __m128 v128; 01188 01189 v128 = _mm256_castps256_ps128(a); 01190 __builtin_ia32_storeups(addr_lo, v128); 01191 v128 = _mm256_extractf128_ps(a, 1); 01192 __builtin_ia32_storeups(addr_hi, v128); 01193 } 01194 01195 static __inline void __attribute__((__always_inline__, __nodebug__)) 01196 _mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a) 01197 { 01198 __m128d v128; 01199 01200 v128 = _mm256_castpd256_pd128(a); 01201 __builtin_ia32_storeupd(addr_lo, v128); 01202 v128 = _mm256_extractf128_pd(a, 1); 01203 __builtin_ia32_storeupd(addr_hi, v128); 01204 } 01205 01206 static __inline void __attribute__((__always_inline__, __nodebug__)) 01207 _mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a) 01208 { 01209 __m128i v128; 01210 01211 v128 = _mm256_castsi256_si128(a); 01212 __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128); 01213 v128 = _mm256_extractf128_si256(a, 1); 01214 __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128); 01215 }