clang API Documentation

avxintrin.h
Go to the documentation of this file.
00001 /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
00002  *
00003  * Permission is hereby granted, free of charge, to any person obtaining a copy
00004  * of this software and associated documentation files (the "Software"), to deal
00005  * in the Software without restriction, including without limitation the rights
00006  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00007  * copies of the Software, and to permit persons to whom the Software is
00008  * furnished to do so, subject to the following conditions:
00009  *
00010  * The above copyright notice and this permission notice shall be included in
00011  * all copies or substantial portions of the Software.
00012  *
00013  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00014  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00015  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00016  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00017  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00018  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00019  * THE SOFTWARE.
00020  *
00021  *===-----------------------------------------------------------------------===
00022  */
00023 
/* Guard: compilation stops with the #error below unless __IMMINTRIN_H is
 * already defined, i.e. unless this header was reached via <immintrin.h>. */
00024 #ifndef __IMMINTRIN_H
00025 #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
00026 #endif
00027 
/* Internal 32-byte (256-bit) vector views, one per element type. They are
 * used below for casts to builtins and for per-lane subscripting. */
00028 typedef double __v4df __attribute__ ((__vector_size__ (32)));
00029 typedef float __v8sf __attribute__ ((__vector_size__ (32)));
00030 typedef long long __v4di __attribute__ ((__vector_size__ (32)));
00031 typedef int __v8si __attribute__ ((__vector_size__ (32)));
00032 typedef short __v16hi __attribute__ ((__vector_size__ (32)));
00033 typedef char __v32qi __attribute__ ((__vector_size__ (32)));
00034 
/* Public 256-bit vector types: __m256 is a vector of float, __m256d of
 * double, and __m256i of long long. */
00035 typedef float __m256 __attribute__ ((__vector_size__ (32)));
00036 typedef double __m256d __attribute__((__vector_size__(32)));
00037 typedef long long __m256i __attribute__((__vector_size__(32)));
00038 
00039 /* Arithmetic */
/* Returns the element-wise sum a + b of two 256-bit vectors of double. */
00040 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00041 _mm256_add_pd(__m256d a, __m256d b)
00042 {
00043   return a+b;
00044 }
00045 
/* Returns the element-wise sum a + b of two 256-bit vectors of float. */
00046 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00047 _mm256_add_ps(__m256 a, __m256 b)
00048 {
00049   return a+b;
00050 }
00051 
/* Returns the element-wise difference a - b of two 256-bit vectors of double. */
00052 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00053 _mm256_sub_pd(__m256d a, __m256d b)
00054 {
00055   return a-b;
00056 }
00057 
/* Returns the element-wise difference a - b of two 256-bit vectors of float. */
00058 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00059 _mm256_sub_ps(__m256 a, __m256 b)
00060 {
00061   return a-b;
00062 }
00063 
/* Forwards a and b to __builtin_ia32_addsubpd256 and returns its result.
 * NOTE(review): presumably alternates subtract/add across lanes (VADDSUBPD);
 * confirm the lane order against the instruction reference. */
00064 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00065 _mm256_addsub_pd(__m256d a, __m256d b)
00066 {
00067   return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
00068 }
00069 
/* Forwards a and b to __builtin_ia32_addsubps256 and returns its result.
 * NOTE(review): presumably alternates subtract/add across lanes (VADDSUBPS);
 * confirm the lane order against the instruction reference. */
00070 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00071 _mm256_addsub_ps(__m256 a, __m256 b)
00072 {
00073   return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
00074 }
00075 
/* Returns the element-wise quotient a / b of two 256-bit vectors of double. */
00076 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00077 _mm256_div_pd(__m256d a, __m256d b)
00078 {
00079   return a / b;
00080 }
00081 
/* Returns the element-wise quotient a / b of two 256-bit vectors of float. */
00082 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00083 _mm256_div_ps(__m256 a, __m256 b)
00084 {
00085   return a / b;
00086 }
00087 
/* Forwards a and b to __builtin_ia32_maxpd256 and returns its result
 * (presumably the per-lane maximum; NaN/zero-sign handling is whatever the
 * builtin defines - confirm). */
00088 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00089 _mm256_max_pd(__m256d a, __m256d b)
00090 {
00091   return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
00092 }
00093 
/* Forwards a and b to __builtin_ia32_maxps256 and returns its result
 * (presumably the per-lane maximum; NaN/zero-sign handling is whatever the
 * builtin defines - confirm). */
00094 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00095 _mm256_max_ps(__m256 a, __m256 b)
00096 {
00097   return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
00098 }
00099 
/* Forwards a and b to __builtin_ia32_minpd256 and returns its result
 * (presumably the per-lane minimum; NaN/zero-sign handling is whatever the
 * builtin defines - confirm). */
00100 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00101 _mm256_min_pd(__m256d a, __m256d b)
00102 {
00103   return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
00104 }
00105 
/* Forwards a and b to __builtin_ia32_minps256 and returns its result
 * (presumably the per-lane minimum; NaN/zero-sign handling is whatever the
 * builtin defines - confirm). */
00106 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00107 _mm256_min_ps(__m256 a, __m256 b)
00108 {
00109   return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
00110 }
00111 
/* Returns the element-wise product a * b of two 256-bit vectors of double. */
00112 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00113 _mm256_mul_pd(__m256d a, __m256d b)
00114 {
00115   return a * b;
00116 }
00117 
/* Returns the element-wise product a * b of two 256-bit vectors of float. */
00118 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00119 _mm256_mul_ps(__m256 a, __m256 b)
00120 {
00121   return a * b;
00122 }
00123 
/* Forwards a to __builtin_ia32_sqrtpd256 and returns its result
 * (presumably the per-lane square root - confirm). */
00124 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00125 _mm256_sqrt_pd(__m256d a)
00126 {
00127   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
00128 }
00129 
/* Forwards a to __builtin_ia32_sqrtps256 and returns its result
 * (presumably the per-lane square root - confirm). */
00130 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00131 _mm256_sqrt_ps(__m256 a)
00132 {
00133   return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
00134 }
00135 
/* Forwards a to __builtin_ia32_rsqrtps256 and returns its result.
 * NOTE(review): presumably an approximate per-lane reciprocal square root;
 * confirm the precision guarantee against the instruction reference. */
00136 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00137 _mm256_rsqrt_ps(__m256 a)
00138 {
00139   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
00140 }
00141 
/* Forwards a to __builtin_ia32_rcpps256 and returns its result.
 * NOTE(review): presumably an approximate per-lane reciprocal; confirm the
 * precision guarantee against the instruction reference. */
00142 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00143 _mm256_rcp_ps(__m256 a)
00144 {
00145   return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
00146 }
00147 
/* Statement-expression macro: copies V into a local __m256d (so V is
 * evaluated once) and passes it with mode M to __builtin_ia32_roundpd256.
 * M is handed to the builtin unchanged; presumably it must be a compile-time
 * constant - confirm. */
00148 #define _mm256_round_pd(V, M) __extension__ ({ \
00149     __m256d __V = (V); \
00150     (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); })
00151 
/* Statement-expression macro: copies V into a local __m256 (so V is
 * evaluated once) and passes it with mode M to __builtin_ia32_roundps256.
 * M is handed to the builtin unchanged; presumably it must be a compile-time
 * constant - confirm. */
00152 #define _mm256_round_ps(V, M) __extension__ ({ \
00153   __m256 __V = (V); \
00154   (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); })
00155 
/* Ceiling/floor shorthands: each expands to the matching _mm256_round_*
 * macro with _MM_FROUND_CEIL or _MM_FROUND_FLOOR as the mode. Those two
 * constants are not defined in this part of the file. */
00156 #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
00157 #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
00158 #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
00159 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
00160 
00161 /* Logical */
/* Bitwise AND of the raw bits of a and b. The operands are reinterpreted as
 * 64-bit integer lanes for the operation and the result is cast back. */
00162 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00163 _mm256_and_pd(__m256d a, __m256d b)
00164 {
00165   return (__m256d)((__v4di)a & (__v4di)b);
00166 }
00167 
/* Bitwise AND of the raw bits of a and b. The operands are reinterpreted as
 * 32-bit integer lanes for the operation and the result is cast back. */
00168 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00169 _mm256_and_ps(__m256 a, __m256 b)
00170 {
00171   return (__m256)((__v8si)a & (__v8si)b);
00172 }
00173 
/* Returns (~a) & b on the raw bits: the FIRST operand is the one inverted.
 * Computed on the 64-bit integer view and cast back to __m256d. */
00174 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00175 _mm256_andnot_pd(__m256d a, __m256d b)
00176 {
00177   return (__m256d)(~(__v4di)a & (__v4di)b);
00178 }
00179 
/* Returns (~a) & b on the raw bits: the FIRST operand is the one inverted.
 * Computed on the 32-bit integer view and cast back to __m256. */
00180 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00181 _mm256_andnot_ps(__m256 a, __m256 b)
00182 {
00183   return (__m256)(~(__v8si)a & (__v8si)b);
00184 }
00185 
/* Bitwise OR of the raw bits of a and b, computed on the 64-bit integer view
 * and cast back to __m256d. */
00186 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00187 _mm256_or_pd(__m256d a, __m256d b)
00188 {
00189   return (__m256d)((__v4di)a | (__v4di)b);
00190 }
00191 
/* Bitwise OR of the raw bits of a and b, computed on the 32-bit integer view
 * and cast back to __m256. */
00192 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00193 _mm256_or_ps(__m256 a, __m256 b)
00194 {
00195   return (__m256)((__v8si)a | (__v8si)b);
00196 }
00197 
/* Bitwise XOR of the raw bits of a and b, computed on the 64-bit integer view
 * and cast back to __m256d. */
00198 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00199 _mm256_xor_pd(__m256d a, __m256d b)
00200 {
00201   return (__m256d)((__v4di)a ^ (__v4di)b);
00202 }
00203 
/* Bitwise XOR of the raw bits of a and b, computed on the 32-bit integer view
 * and cast back to __m256. */
00204 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00205 _mm256_xor_ps(__m256 a, __m256 b)
00206 {
00207   return (__m256)((__v8si)a ^ (__v8si)b);
00208 }
00209 
00210 /* Horizontal arithmetic */
/* Forwards a and b to __builtin_ia32_haddpd256 and returns its result.
 * NOTE(review): presumably sums adjacent lane pairs (VHADDPD); confirm the
 * output lane layout against the instruction reference. */
00211 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00212 _mm256_hadd_pd(__m256d a, __m256d b)
00213 {
00214   return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
00215 }
00216 
/* Forwards a and b to __builtin_ia32_haddps256 and returns its result.
 * NOTE(review): presumably sums adjacent lane pairs (VHADDPS); confirm the
 * output lane layout against the instruction reference. */
00217 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00218 _mm256_hadd_ps(__m256 a, __m256 b)
00219 {
00220   return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
00221 }
00222 
/* Forwards a and b to __builtin_ia32_hsubpd256 and returns its result.
 * NOTE(review): presumably subtracts adjacent lane pairs (VHSUBPD); confirm
 * the operand order and lane layout against the instruction reference. */
00223 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00224 _mm256_hsub_pd(__m256d a, __m256d b)
00225 {
00226   return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
00227 }
00228 
/* Forwards a and b to __builtin_ia32_hsubps256 and returns its result.
 * NOTE(review): presumably subtracts adjacent lane pairs (VHSUBPS); confirm
 * the operand order and lane layout against the instruction reference. */
00229 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00230 _mm256_hsub_ps(__m256 a, __m256 b)
00231 {
00232   return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
00233 }
00234 
00235 /* Vector permutations */
/* 128-bit variable permute: forwards a and the control vector c to
 * __builtin_ia32_vpermilvarpd. The __m128d/__m128i/__v2df/__v2di types are
 * not defined in this part of the file. Which bits of c select each lane is
 * defined by the builtin - confirm. */
00236 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
00237 _mm_permutevar_pd(__m128d a, __m128i c)
00238 {
00239   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
00240 }
00241 
/* 256-bit variable permute: forwards a and the control vector c to
 * __builtin_ia32_vpermilvarpd256. Which bits of c select each lane is defined
 * by the builtin - confirm. */
00242 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00243 _mm256_permutevar_pd(__m256d a, __m256i c)
00244 {
00245   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
00246 }
00247 
/* 128-bit variable permute: forwards a and the control vector c to
 * __builtin_ia32_vpermilvarps. The __m128/__m128i/__v4sf/__v4si types are not
 * defined in this part of the file. Which bits of c select each lane is
 * defined by the builtin - confirm. */
00248 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
00249 _mm_permutevar_ps(__m128 a, __m128i c)
00250 {
00251   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
00252 }
00253 
/* 256-bit variable permute: forwards a and the control vector c (viewed as
 * eight 32-bit integers) to __builtin_ia32_vpermilvarps256. Which bits of c
 * select each lane is defined by the builtin - confirm. */
00254 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00255 _mm256_permutevar_ps(__m256 a, __m256i c)
00256 {
00257   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a,
00258               (__v8si)c);
00259 }
00260 
/* Constant permute of a 2-lane double vector: result lane 0 is A[bit 0 of C]
 * and lane 1 is A[bit 1 of C]. Both shuffle indices stay below 2, so the
 * zero vector passed as the second operand is never selected. A is evaluated
 * once; C appears twice in the expansion. */
00261 #define _mm_permute_pd(A, C) __extension__ ({ \
00262   __m128d __A = (A); \
00263   (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \
00264                                    (C) & 0x1, ((C) & 0x2) >> 1); })
00265 
/* Constant permute of a 4-lane double vector, done per 128-bit half: bits 0
 * and 1 of C pick from A[0..1] for result lanes 0 and 1; bits 2 and 3 pick
 * from A[2..3] for result lanes 2 and 3. All indices stay below 4, so the
 * zero vector passed as the second operand is never selected. A is evaluated
 * once; C appears several times in the expansion. */
00266 #define _mm256_permute_pd(A, C) __extension__ ({ \
00267   __m256d __A = (A); \
00268   (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \
00269                                    (C) & 0x1, ((C) & 0x2) >> 1, \
00270                                    2 + (((C) & 0x4) >> 2), \
00271                                    2 + (((C) & 0x8) >> 3)); })
00272 
/* Constant permute of a 4-lane float vector: each 2-bit field of C (bits
 * 1:0, 3:2, 5:4, 7:6) is the index into A for result lanes 0..3. All indices
 * stay below 4, so the zero vector second operand is never selected. A is
 * evaluated once; C appears several times in the expansion. */
00273 #define _mm_permute_ps(A, C) __extension__ ({ \
00274   __m128 __A = (A); \
00275   (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \
00276                                    (C) & 0x3, ((C) & 0xc) >> 2, \
00277                                    ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
00278 
/* Constant permute of an 8-lane float vector. The four 2-bit fields of C
 * select from A[0..3] for result lanes 0..3, and the same four fields,
 * offset by 4, select from A[4..7] for result lanes 4..7. All indices stay
 * below 8, so the zero vector second operand is never selected. A is
 * evaluated once; C appears several times in the expansion. */
00279 #define _mm256_permute_ps(A, C) __extension__ ({ \
00280   __m256 __A = (A); \
00281   (__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \
00282                                   (C) & 0x3, ((C) & 0xc) >> 2, \
00283                                   ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \
00284                                   4 + (((C) & 0x03) >> 0), \
00285                                   4 + (((C) & 0x0c) >> 2), \
00286                                   4 + (((C) & 0x30) >> 4), \
00287                                   4 + (((C) & 0xc0) >> 6)); })
00288 
/* Copies V1 and V2 into locals (each evaluated once) and passes them with
 * control M to __builtin_ia32_vperm2f128_pd256. How M selects 128-bit halves
 * is defined by the builtin - confirm against the instruction reference. */
00289 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
00290   __m256d __V1 = (V1); \
00291   __m256d __V2 = (V2); \
00292   (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); })
00293 
/* Copies V1 and V2 into locals (each evaluated once) and passes them with
 * control M to __builtin_ia32_vperm2f128_ps256. How M selects 128-bit halves
 * is defined by the builtin - confirm against the instruction reference. */
00294 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
00295   __m256 __V1 = (V1); \
00296   __m256 __V2 = (V2); \
00297   (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
00298 
/* Copies V1 and V2 into locals (each evaluated once) and passes them, viewed
 * as eight 32-bit integers, with control M to
 * __builtin_ia32_vperm2f128_si256. How M selects 128-bit halves is defined
 * by the builtin - confirm against the instruction reference. */
00299 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
00300   __m256i __V1 = (V1); \
00301   __m256i __V2 = (V2); \
00302   (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); })
00303 
00304 /* Vector Blend */
/* Copies V1 and V2 into locals (each evaluated once) and passes them with
 * immediate mask M to __builtin_ia32_blendpd256. Presumably each bit of M
 * chooses the V1 or V2 lane - confirm the polarity. */
00305 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
00306   __m256d __V1 = (V1); \
00307   __m256d __V2 = (V2); \
00308   (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); })
00309 
/* Copies V1 and V2 into locals (each evaluated once) and passes them with
 * immediate mask M to __builtin_ia32_blendps256. Presumably each bit of M
 * chooses the V1 or V2 lane - confirm the polarity. */
00310 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
00311   __m256 __V1 = (V1); \
00312   __m256 __V2 = (V2); \
00313   (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
00314 
/* Variable blend: forwards a, b and the selector vector c to
 * __builtin_ia32_blendvpd256. Presumably the sign bit of each lane of c
 * chooses between a and b - confirm the polarity. */
00315 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00316 _mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
00317 {
00318   return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
00319 }
00320 
/* Variable blend: forwards a, b and the selector vector c to
 * __builtin_ia32_blendvps256. Presumably the sign bit of each lane of c
 * chooses between a and b - confirm the polarity. */
00321 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00322 _mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
00323 {
00324   return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
00325 }
00326 
00327 /* Vector Dot Product */
/* Copies V1 and V2 into locals (each evaluated once) and passes them with
 * immediate M to __builtin_ia32_dpps256. How M selects input and output
 * lanes is defined by the builtin - confirm against the instruction
 * reference. */
00328 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
00329   __m256 __V1 = (V1); \
00330   __m256 __V2 = (V2); \
00331   (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); })
00332 
00333 /* Vector shuffle */
/* Constant two-source shuffle of 8-lane float vectors. Shuffle indices 0..7
 * name lanes of a and 8..15 name lanes of b, so with the four 2-bit fields
 * of mask (bits 1:0, 3:2, 5:4, 7:6):
 *   result lanes 0,1 come from a[0..3] (fields 0 and 1),
 *   result lanes 2,3 come from b[0..3] (fields 2 and 3, +8),
 *   result lanes 4,5 come from a[4..7] (fields 0 and 1, +4),
 *   result lanes 6,7 come from b[4..7] (fields 2 and 3, +12).
 * a and b are evaluated once; mask appears several times in the expansion. */
00334 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
00335         __m256 __a = (a); \
00336         __m256 __b = (b); \
00337         (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
00338         (mask) & 0x3,                ((mask) & 0xc) >> 2, \
00339         (((mask) & 0x30) >> 4) + 8,  (((mask) & 0xc0) >> 6) + 8, \
00340         ((mask) & 0x3) + 4,          (((mask) & 0xc) >> 2) + 4, \
00341         (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })
00342 
/* Constant two-source shuffle of 4-lane double vectors. Shuffle indices 0..3
 * name lanes of a and 4..7 name lanes of b, so:
 *   result lane 0 = a[bit 0 of mask],
 *   result lane 1 = b[bit 1 of mask]      (+4),
 *   result lane 2 = a[2 + bit 2 of mask],
 *   result lane 3 = b[2 + bit 3 of mask]  (+6).
 * a and b are evaluated once; mask appears several times in the expansion. */
00343 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
00344         __m256d __a = (a); \
00345         __m256d __b = (b); \
00346         (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
00347         (mask) & 0x1, \
00348         (((mask) & 0x2) >> 1) + 4, \
00349         (((mask) & 0x4) >> 2) + 2, \
00350         (((mask) & 0x8) >> 3) + 6); })
00351 
00352 /* Compare */
/* Comparison predicate codes (0x00-0x1f) passed as the third argument of the
 * _mm*_cmp_* macros below. Suffix letters, as spelled out in the per-line
 * comments: O = ordered, U = unordered, Q = non-signaling, S = signaling. */
00353 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
00354 #define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
00355 #define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
00356 #define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
00357 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
00358 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
00359 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
00360 #define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)  */
00361 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
00362 #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
00363 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
00364 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
00365 #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
00366 #define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
00367 #define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
00368 #define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
00369 #define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
00370 #define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
00371 #define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
00372 #define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
00373 #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
00374 #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
00375 #define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
00376 #define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
00377 #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
00378 #define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
00379 #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
00380 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
00381 #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
00382 #define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
00383 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
00384 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
00385 
/* 128-bit packed-double compare: copies a and b into locals (each evaluated
 * once) and passes them with predicate c to __builtin_ia32_cmppd. c is
 * presumably one of the _CMP_* codes; the result encoding is defined by the
 * builtin - confirm. */
00386 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
00387   __m128d __a = (a); \
00388   __m128d __b = (b); \
00389   (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })
00390 
/* 128-bit packed-float compare: copies a and b into locals (each evaluated
 * once) and passes them with predicate c to __builtin_ia32_cmpps. c is
 * presumably one of the _CMP_* codes; the result encoding is defined by the
 * builtin - confirm. */
00391 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
00392   __m128 __a = (a); \
00393   __m128 __b = (b); \
00394   (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })
00395 
/* 256-bit packed-double compare: copies a and b into locals (each evaluated
 * once) and passes them with predicate c to __builtin_ia32_cmppd256. c is
 * presumably one of the _CMP_* codes; the result encoding is defined by the
 * builtin - confirm. */
00396 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
00397   __m256d __a = (a); \
00398   __m256d __b = (b); \
00399   (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })
00400 
/* 256-bit packed-float compare: copies a and b into locals (each evaluated
 * once) and passes them with predicate c to __builtin_ia32_cmpps256. c is
 * presumably one of the _CMP_* codes; the result encoding is defined by the
 * builtin - confirm. */
00401 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
00402   __m256 __a = (a); \
00403   __m256 __b = (b); \
00404   (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })
00405 
/* Scalar-double compare: copies a and b into locals (each evaluated once)
 * and passes them with predicate c to __builtin_ia32_cmpsd. Presumably only
 * the lowest lane is compared - confirm how the upper lane is filled. */
00406 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
00407   __m128d __a = (a); \
00408   __m128d __b = (b); \
00409   (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })
00410 
/* Scalar-float compare: copies a and b into locals (each evaluated once) and
 * passes them with predicate c to __builtin_ia32_cmpss. Presumably only the
 * lowest lane is compared - confirm how the upper lanes are filled. */
00411 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
00412   __m128 __a = (a); \
00413   __m128 __b = (b); \
00414   (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
00415 
00416 /* Vector extract */
/* Copies A into a local (evaluated once) and passes it with selector O to
 * __builtin_ia32_vextractf128_pd256, yielding a __m128d. Presumably O picks
 * the low or high 128-bit half - confirm. */
00417 #define _mm256_extractf128_pd(A, O) __extension__ ({ \
00418   __m256d __A = (A); \
00419   (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
00420 
/* Copies A into a local (evaluated once) and passes it with selector O to
 * __builtin_ia32_vextractf128_ps256, yielding a __m128. Presumably O picks
 * the low or high 128-bit half - confirm. */
00421 #define _mm256_extractf128_ps(A, O) __extension__ ({ \
00422   __m256 __A = (A); \
00423   (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
00424 
/* Copies A into a local (evaluated once) and passes it with selector O to
 * __builtin_ia32_vextractf128_si256, yielding a __m128i. Presumably O picks
 * the low or high 128-bit half - confirm. */
00425 #define _mm256_extractf128_si256(A, O) __extension__ ({ \
00426   __m256i __A = (A); \
00427   (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
00428 
/* Returns 32-bit element (imm & 7) of a.
 *
 * a is viewed as eight ints. The index is masked to the eight valid lanes so
 * an out-of-range imm can no longer subscript past the end of the vector;
 * this mirrors the masking already done by _mm256_insert_epi32. */
00429 static __inline int __attribute__((__always_inline__, __nodebug__))
00430 _mm256_extract_epi32(__m256i a, int const imm)
00431 {
00432   __v8si b = (__v8si)a;
00433   return b[imm & 7];
00434 }
00435 
/* Returns 16-bit element (imm & 15) of a, converted to int.
 *
 * a is viewed as sixteen shorts. The index is masked to the sixteen valid
 * lanes so an out-of-range imm can no longer subscript past the end of the
 * vector; this mirrors the masking already done by _mm256_insert_epi16.
 * NOTE(review): the element type is short, so the returned int is
 * sign-extended; confirm whether zero-extension is what callers expect. */
00436 static __inline int __attribute__((__always_inline__, __nodebug__))
00437 _mm256_extract_epi16(__m256i a, int const imm)
00438 {
00439   __v16hi b = (__v16hi)a;
00440   return b[imm & 15];
00441 }
00442 
/* Returns 8-bit element (imm & 31) of a, converted to int.
 *
 * a is viewed as thirty-two chars. The index is masked to the thirty-two
 * valid lanes so an out-of-range imm can no longer subscript past the end of
 * the vector; this mirrors the masking already done by _mm256_insert_epi8.
 * NOTE(review): the element type is plain char, so whether the returned int
 * is sign- or zero-extended follows the target's char signedness - confirm
 * what callers expect. */
00443 static __inline int __attribute__((__always_inline__, __nodebug__))
00444 _mm256_extract_epi8(__m256i a, int const imm)
00445 {
00446   __v32qi b = (__v32qi)a;
00447   return b[imm & 31];
00448 }
00449 
/* Returns 64-bit element (imm & 3) of a. Only compiled when __x86_64__ is
 * defined.
 *
 * The index is masked to the four valid lanes so an out-of-range imm can no
 * longer subscript past the end of the vector; this mirrors the masking
 * already done by _mm256_insert_epi64. */
00450 #ifdef __x86_64__
00451 static __inline long long  __attribute__((__always_inline__, __nodebug__))
00452 _mm256_extract_epi64(__m256i a, const int imm)
00453 {
00454   __v4di b = (__v4di)a;
00455   return b[imm & 3];
00456 }
00457 #endif
00458 
00459 /* Vector insert */
/* Copies V1 (256-bit) and V2 (128-bit) into locals (each evaluated once) and
 * passes them with selector O to __builtin_ia32_vinsertf128_pd256.
 * Presumably O picks which 128-bit half of V1 is replaced by V2 - confirm. */
00460 #define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \
00461   __m256d __V1 = (V1); \
00462   __m128d __V2 = (V2); \
00463   (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); })
00464 
/* Copies V1 (256-bit) and V2 (128-bit) into locals (each evaluated once) and
 * passes them with selector O to __builtin_ia32_vinsertf128_ps256.
 * Presumably O picks which 128-bit half of V1 is replaced by V2 - confirm. */
00465 #define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \
00466   __m256 __V1 = (V1); \
00467   __m128 __V2 = (V2); \
00468   (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); })
00469 
/* Copies V1 (256-bit) and V2 (128-bit) into locals (each evaluated once) and
 * passes them with selector O to __builtin_ia32_vinsertf128_si256.
 * Presumably O picks which 128-bit half of V1 is replaced by V2 - confirm. */
00470 #define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \
00471   __m256i __V1 = (V1); \
00472   __m128i __V2 = (V2); \
00473   (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); })
00474 
/* Returns a copy of a in which 32-bit element (imm & 7) is replaced by b.
 * The mask keeps the index within the eight int lanes. */
00475 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00476 _mm256_insert_epi32(__m256i a, int b, int const imm)
00477 {
00478   __v8si c = (__v8si)a;
00479   c[imm & 7] = b;
00480   return (__m256i)c;
00481 }
00482 
/* Returns a copy of a in which 16-bit element (imm & 15) is replaced by b.
 * b is an int assigned to a short lane, so it is converted to short on
 * store. The mask keeps the index within the sixteen short lanes. */
00483 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00484 _mm256_insert_epi16(__m256i a, int b, int const imm)
00485 {
00486   __v16hi c = (__v16hi)a;
00487   c[imm & 15] = b;
00488   return (__m256i)c;
00489 }
00490 
/* Returns a copy of a in which 8-bit element (imm & 31) is replaced by b.
 * b is an int assigned to a char lane, so it is converted to char on store.
 * The mask keeps the index within the thirty-two char lanes. */
00491 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00492 _mm256_insert_epi8(__m256i a, int b, int const imm)
00493 {
00494   __v32qi c = (__v32qi)a;
00495   c[imm & 31] = b;
00496   return (__m256i)c;
00497 }
00498 
/* Returns a copy of a in which 64-bit element (imm & 3) is replaced by b.
 * Only compiled when __x86_64__ is defined.
 *
 * b is declared long long so the full 64-bit value reaches the lane. With
 * the previous int parameter any value wider than 32 bits was truncated at
 * the call before being stored. Callers that pass an int are unaffected:
 * the value is sign-extended exactly as it was by the old assignment. */
00499 #ifdef __x86_64__
00500 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00501 _mm256_insert_epi64(__m256i a, long long b, int const imm)
00502 {
00503   __v4di c = (__v4di)a;
00504   c[imm & 3] = b;
00505   return (__m256i)c;
00506 }
00507 #endif
00508 
00509 /* Conversion */
/* Forwards a (a 128-bit vector viewed as four 32-bit ints) to
 * __builtin_ia32_cvtdq2pd256 and returns the __m256d result; presumably an
 * int-to-double conversion per lane - confirm. */
00510 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00511 _mm256_cvtepi32_pd(__m128i a)
00512 {
00513   return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
00514 }
00515 
/* Forwards a (viewed as eight 32-bit ints) to __builtin_ia32_cvtdq2ps256 and
 * returns the __m256 result; presumably an int-to-float conversion per lane
 * - confirm. */
00516 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00517 _mm256_cvtepi32_ps(__m256i a)
00518 {
00519   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
00520 }
00521 
/* Forwards a to __builtin_ia32_cvtpd2ps256 and returns the 128-bit result;
 * presumably narrows four doubles to four floats - confirm the rounding
 * behaviour. */
00522 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
00523 _mm256_cvtpd_ps(__m256d a)
00524 {
00525   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
00526 }
00527 
/* Forwards a to __builtin_ia32_cvtps2dq256 and returns the __m256i result;
 * presumably float-to-int per lane using the current rounding mode -
 * confirm. */
00528 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00529 _mm256_cvtps_epi32(__m256 a)
00530 {
00531   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
00532 }
00533 
/* Forwards a (a 128-bit vector viewed as four floats) to
 * __builtin_ia32_cvtps2pd256 and returns the __m256d result; presumably
 * widens each float to double - confirm. */
00534 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00535 _mm256_cvtps_pd(__m128 a)
00536 {
00537   return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
00538 }
00539 
/* Forwards a to __builtin_ia32_cvttpd2dq256 and returns the 128-bit integer
 * result; presumably double-to-int per lane with truncation - confirm. */
00540 static __inline __m128i __attribute__((__always_inline__, __nodebug__))
00541 _mm256_cvttpd_epi32(__m256d a)
00542 {
00543   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
00544 }
00545 
/* Forwards a to __builtin_ia32_cvtpd2dq256 and returns the 128-bit integer
 * result; presumably double-to-int per lane using the current rounding mode
 * - confirm. */
00546 static __inline __m128i __attribute__((__always_inline__, __nodebug__))
00547 _mm256_cvtpd_epi32(__m256d a)
00548 {
00549   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
00550 }
00551 
/* Forwards a to __builtin_ia32_cvttps2dq256 and returns the __m256i result;
 * presumably float-to-int per lane with truncation - confirm. */
00552 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00553 _mm256_cvttps_epi32(__m256 a)
00554 {
00555   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
00556 }
00557 
00558 /* Vector replicate */
/* Duplicates the odd-indexed lanes of a: the result is
 * { a[1], a[1], a[3], a[3], a[5], a[5], a[7], a[7] }. */
00559 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00560 _mm256_movehdup_ps(__m256 a)
00561 {
00562   return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
00563 }
00564 
/* Duplicates the even-indexed lanes of a: the result is
 * { a[0], a[0], a[2], a[2], a[4], a[4], a[6], a[6] }. */
00565 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00566 _mm256_moveldup_ps(__m256 a)
00567 {
00568   return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
00569 }
00570 
/* Duplicates the even-indexed lanes of a: the result is
 * { a[0], a[0], a[2], a[2] }. */
00571 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00572 _mm256_movedup_pd(__m256d a)
00573 {
00574   return __builtin_shufflevector(a, a, 0, 0, 2, 2);
00575 }
00576 
00577 /* Unpack and Interleave */
/* Interleaves the odd lanes of a and b: shuffle indices 0..3 name a and 4..7
 * name b, so the result is { a[1], b[1], a[3], b[3] }. */
00578 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00579 _mm256_unpackhi_pd(__m256d a, __m256d b)
00580 {
00581   return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
00582 }
00583 
/* Interleaves the even lanes of a and b: shuffle indices 0..3 name a and
 * 4..7 name b, so the result is { a[0], b[0], a[2], b[2] }. */
00584 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00585 _mm256_unpacklo_pd(__m256d a, __m256d b)
00586 {
00587   return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
00588 }
00589 
/* Interleaves the upper two lanes of each 128-bit half of a and b: shuffle
 * indices 0..7 name a and 8..15 name b, so the result is
 * { a[2], b[2], a[3], b[3], a[6], b[6], a[7], b[7] }. */
00590 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00591 _mm256_unpackhi_ps(__m256 a, __m256 b)
00592 {
00593   return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
00594 }
00595 
/* Interleaves the lower two lanes of each 128-bit half of a and b: shuffle
 * indices 0..7 name a and 8..15 name b, so the result is
 * { a[0], b[0], a[1], b[1], a[4], b[4], a[5], b[5] }. */
00596 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00597 _mm256_unpacklo_ps(__m256 a, __m256 b)
00598 {
00599   return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
00600 }
00601 
00602 /* Bit Test */
/* Returns the int result of __builtin_ia32_vtestzpd(a, b) on 128-bit double
 * vectors. NOTE(review): presumably the ZF outcome of VTESTPD (sign bits of
 * a AND b all zero) - confirm. */
00603 static __inline int __attribute__((__always_inline__, __nodebug__))
00604 _mm_testz_pd(__m128d a, __m128d b)
00605 {
00606   return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
00607 }
00608 
/* Returns the int result of __builtin_ia32_vtestcpd(a, b) on 128-bit double
 * vectors. NOTE(review): presumably the CF outcome of VTESTPD (sign bits of
 * (NOT a) AND b all zero) - confirm. */
00609 static __inline int __attribute__((__always_inline__, __nodebug__))
00610 _mm_testc_pd(__m128d a, __m128d b)
00611 {
00612   return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
00613 }
00614 
/* Returns the int result of __builtin_ia32_vtestnzcpd(a, b) on 128-bit
 * double vectors. NOTE(review): presumably 1 when VTESTPD leaves both ZF and
 * CF clear - confirm. */
00615 static __inline int __attribute__((__always_inline__, __nodebug__))
00616 _mm_testnzc_pd(__m128d a, __m128d b)
00617 {
00618   return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
00619 }
00620 
/* Returns the int result of __builtin_ia32_vtestzps(a, b) on 128-bit float
 * vectors. NOTE(review): presumably the ZF outcome of VTESTPS (sign bits of
 * a AND b all zero) - confirm. */
00621 static __inline int __attribute__((__always_inline__, __nodebug__))
00622 _mm_testz_ps(__m128 a, __m128 b)
00623 {
00624   return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
00625 }
00626 
/* Returns the int result of __builtin_ia32_vtestcps(a, b) on 128-bit float
 * vectors. NOTE(review): presumably the CF outcome of VTESTPS (sign bits of
 * (NOT a) AND b all zero) - confirm. */
00627 static __inline int __attribute__((__always_inline__, __nodebug__))
00628 _mm_testc_ps(__m128 a, __m128 b)
00629 {
00630   return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
00631 }
00632 
/* Returns the int result of __builtin_ia32_vtestnzcps(a, b) on 128-bit float
 * vectors. NOTE(review): presumably 1 when VTESTPS leaves both ZF and CF
 * clear - confirm. */
00633 static __inline int __attribute__((__always_inline__, __nodebug__))
00634 _mm_testnzc_ps(__m128 a, __m128 b)
00635 {
00636   return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
00637 }
00638 
/* Returns the int result of __builtin_ia32_vtestzpd256(a, b) on 256-bit
 * double vectors. NOTE(review): presumably the ZF outcome of VTESTPD (sign
 * bits of a AND b all zero) - confirm. */
00639 static __inline int __attribute__((__always_inline__, __nodebug__))
00640 _mm256_testz_pd(__m256d a, __m256d b)
00641 {
00642   return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
00643 }
00644 
/* Returns the int result of __builtin_ia32_vtestcpd256(a, b) on 256-bit
 * double vectors. NOTE(review): presumably the CF outcome of VTESTPD (sign
 * bits of (NOT a) AND b all zero) - confirm. */
00645 static __inline int __attribute__((__always_inline__, __nodebug__))
00646 _mm256_testc_pd(__m256d a, __m256d b)
00647 {
00648   return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
00649 }
00650 
/* Returns the int result of __builtin_ia32_vtestnzcpd256(a, b) on 256-bit
 * double vectors. NOTE(review): presumably 1 when VTESTPD leaves both ZF and
 * CF clear - confirm. */
00651 static __inline int __attribute__((__always_inline__, __nodebug__))
00652 _mm256_testnzc_pd(__m256d a, __m256d b)
00653 {
00654   return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
00655 }
00656 
/* Returns the int result of __builtin_ia32_vtestzps256(a, b) on 256-bit
 * float vectors. NOTE(review): presumably the ZF outcome of VTESTPS (sign
 * bits of a AND b all zero) - confirm. */
00657 static __inline int __attribute__((__always_inline__, __nodebug__))
00658 _mm256_testz_ps(__m256 a, __m256 b)
00659 {
00660   return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
00661 }
00662 
/* Returns the int result of __builtin_ia32_vtestcps256(a, b) on 256-bit
 * float vectors. NOTE(review): presumably the CF outcome of VTESTPS (sign
 * bits of (NOT a) AND b all zero) - confirm. */
00663 static __inline int __attribute__((__always_inline__, __nodebug__))
00664 _mm256_testc_ps(__m256 a, __m256 b)
00665 {
00666   return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
00667 }
00668 
/* Returns the int result of __builtin_ia32_vtestnzcps256(a, b) on 256-bit
 * float vectors. NOTE(review): presumably 1 when VTESTPS leaves both ZF and
 * CF clear - confirm. */
00669 static __inline int __attribute__((__always_inline__, __nodebug__))
00670 _mm256_testnzc_ps(__m256 a, __m256 b)
00671 {
00672   return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
00673 }
00674 
/* Returns the int result of __builtin_ia32_ptestz256(a, b) on 256-bit
 * integer vectors. NOTE(review): presumably the ZF outcome of VPTEST (a AND
 * b is all zero bits) - confirm. */
00675 static __inline int __attribute__((__always_inline__, __nodebug__))
00676 _mm256_testz_si256(__m256i a, __m256i b)
00677 {
00678   return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
00679 }
00680 
/* Returns the int result of __builtin_ia32_ptestc256(a, b) on 256-bit
 * integer vectors. NOTE(review): presumably the CF outcome of VPTEST
 * ((NOT a) AND b is all zero bits) - confirm. */
00681 static __inline int __attribute__((__always_inline__, __nodebug__))
00682 _mm256_testc_si256(__m256i a, __m256i b)
00683 {
00684   return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
00685 }
00686 
/* Returns the int result of __builtin_ia32_ptestnzc256(a, b) on 256-bit
 * integer vectors. NOTE(review): presumably 1 when VPTEST leaves both ZF and
 * CF clear - confirm. */
00687 static __inline int __attribute__((__always_inline__, __nodebug__))
00688 _mm256_testnzc_si256(__m256i a, __m256i b)
00689 {
00690   return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
00691 }
00692 
00693 /* Vector extract sign mask */
/* Returns the int result of __builtin_ia32_movmskpd256(a); presumably a
 * 4-bit mask built from the sign bit of each double lane - confirm. */
00694 static __inline int __attribute__((__always_inline__, __nodebug__))
00695 _mm256_movemask_pd(__m256d a)
00696 {
00697   return __builtin_ia32_movmskpd256((__v4df)a);
00698 }
00699 
/* Returns the int result of __builtin_ia32_movmskps256(a); presumably an
 * 8-bit mask built from the sign bit of each float lane - confirm. */
00700 static __inline int __attribute__((__always_inline__, __nodebug__))
00701 _mm256_movemask_ps(__m256 a)
00702 {
00703   return __builtin_ia32_movmskps256((__v8sf)a);
00704 }
00705 
00706 /* Vector zero */
/* Calls __builtin_ia32_vzeroall(); takes no arguments and returns nothing.
 * Presumably emits VZEROALL, clearing the vector registers - confirm the
 * exact register set against the instruction reference. */
00707 static __inline void __attribute__((__always_inline__, __nodebug__))
00708 _mm256_zeroall(void)
00709 {
00710   __builtin_ia32_vzeroall();
00711 }
00712 
/* Calls __builtin_ia32_vzeroupper(); takes no arguments and returns nothing.
 * Presumably emits VZEROUPPER, clearing the upper halves of the vector
 * registers - confirm against the instruction reference. */
00713 static __inline void __attribute__((__always_inline__, __nodebug__))
00714 _mm256_zeroupper(void)
00715 {
00716   __builtin_ia32_vzeroupper();
00717 }
00718 
00719 /* Vector load with broadcast */
/* Passes the pointer a to __builtin_ia32_vbroadcastss and returns the 128-bit
 * result; presumably loads *a and replicates it into every float lane -
 * confirm. */
00720 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
00721 _mm_broadcast_ss(float const *a)
00722 {
00723   return (__m128)__builtin_ia32_vbroadcastss(a);
00724 }
00725 
/* Passes the pointer a to __builtin_ia32_vbroadcastsd256 and returns the
 * 256-bit result; presumably loads *a and replicates it into every double
 * lane - confirm. */
00726 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00727 _mm256_broadcast_sd(double const *a)
00728 {
00729   return (__m256d)__builtin_ia32_vbroadcastsd256(a);
00730 }
00731 
/* Passes the pointer a to __builtin_ia32_vbroadcastss256 and returns the
 * 256-bit result; presumably loads *a and replicates it into every float
 * lane - confirm. */
00732 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00733 _mm256_broadcast_ss(float const *a)
00734 {
00735   return (__m256)__builtin_ia32_vbroadcastss256(a);
00736 }
00737 
/* Passes the pointer a to __builtin_ia32_vbroadcastf128_pd256 and returns
 * the 256-bit result; presumably loads the 128-bit vector at a into both
 * halves - confirm. */
00738 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00739 _mm256_broadcast_pd(__m128d const *a)
00740 {
00741   return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
00742 }
00743 
/* Passes the pointer a to __builtin_ia32_vbroadcastf128_ps256 and returns
 * the 256-bit result; presumably loads the 128-bit vector at a into both
 * halves - confirm. */
00744 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00745 _mm256_broadcast_ps(__m128 const *a)
00746 {
00747   return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
00748 }
00749 
00750 /* SIMD load ops */
/* Loads four doubles by dereferencing p as a __m256d pointer (the cast also
 * drops the const qualifier). NOTE(review): a plain vector dereference
 * presumably requires p to meet __m256d's alignment (32 bytes) - confirm;
 * _mm256_loadu_pd is the packed-struct variant. */
00751 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00752 _mm256_load_pd(double const *p)
00753 {
00754   return *(__m256d *)p;
00755 }
00756 
/* Loads eight floats by dereferencing p as a __m256 pointer (the cast also
 * drops the const qualifier). NOTE(review): a plain vector dereference
 * presumably requires p to meet __m256's alignment (32 bytes) - confirm;
 * _mm256_loadu_ps is the packed-struct variant. */
00757 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00758 _mm256_load_ps(float const *p)
00759 {
00760   return *(__m256 *)p;
00761 }
00762 
/* Loads four doubles from p through a local wrapper struct declared
 * packed (so the access carries no alignment requirement beyond 1 byte) and
 * may_alias (so reading the doubles through the struct type is permitted by
 * the compiler's aliasing rules). */
00763 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00764 _mm256_loadu_pd(double const *p)
00765 {
00766   struct __loadu_pd {
00767     __m256d v;
00768   } __attribute__((packed, may_alias));
00769   return ((struct __loadu_pd*)p)->v;
00770 }
00771 
/* Loads eight floats from p through a local wrapper struct declared
 * packed (so the access carries no alignment requirement beyond 1 byte) and
 * may_alias (so reading the floats through the struct type is permitted by
 * the compiler's aliasing rules). */
00772 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00773 _mm256_loadu_ps(float const *p)
00774 {
00775   struct __loadu_ps {
00776     __m256 v;
00777   } __attribute__((packed, may_alias));
00778   return ((struct __loadu_ps*)p)->v;
00779 }
00780 
/* Returns the 256-bit integer vector stored at p via a plain dereference.
 * NOTE(review): presumably requires p to meet __m256i's alignment (32 bytes)
 * - confirm; _mm256_loadu_si256 is the packed-struct variant. */
00781 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00782 _mm256_load_si256(__m256i const *p)
00783 {
00784   return *p;
00785 }
00786 
/* Loads a 256-bit integer vector from p through a local wrapper struct
 * declared packed (so the access carries no alignment requirement beyond
 * 1 byte) and may_alias. */
00787 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00788 _mm256_loadu_si256(__m256i const *p)
00789 {
00790   struct __loadu_si256 {
00791     __m256i v;
00792   } __attribute__((packed, may_alias));
00793   return ((struct __loadu_si256*)p)->v;
00794 }
00795 
/* Passes p (as a char pointer) to __builtin_ia32_lddqu256 and returns the
 * 256-bit result; presumably an unaligned integer load (VLDDQU) - confirm. */
00796 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00797 _mm256_lddqu_si256(__m256i const *p)
00798 {
00799   return (__m256i)__builtin_ia32_lddqu256((char const *)p);
00800 }
00801 
00802 /* SIMD store ops */
/* Stores the four doubles of a to p by assigning through a __m256d pointer.
 * NOTE(review): presumably requires p to meet __m256d's alignment (32 bytes)
 * - confirm; _mm256_storeu_pd goes through a dedicated builtin instead. */
00803 static __inline void __attribute__((__always_inline__, __nodebug__))
00804 _mm256_store_pd(double *p, __m256d a)
00805 {
00806   *(__m256d *)p = a;
00807 }
00808 
/* Stores the eight floats of a to p by assigning through a __m256 pointer.
 * NOTE(review): presumably requires p to meet __m256's alignment (32 bytes)
 * - confirm; _mm256_storeu_ps goes through a dedicated builtin instead. */
00809 static __inline void __attribute__((__always_inline__, __nodebug__))
00810 _mm256_store_ps(float *p, __m256 a)
00811 {
00812   *(__m256 *)p = a;
00813 }
00814 
/* Stores a to p via __builtin_ia32_storeupd256; presumably an unaligned
 * store of four doubles - confirm. */
00815 static __inline void __attribute__((__always_inline__, __nodebug__))
00816 _mm256_storeu_pd(double *p, __m256d a)
00817 {
00818   __builtin_ia32_storeupd256(p, (__v4df)a);
00819 }
00820 
/* Stores a to p via __builtin_ia32_storeups256; presumably an unaligned
 * store of eight floats - confirm. */
00821 static __inline void __attribute__((__always_inline__, __nodebug__))
00822 _mm256_storeu_ps(float *p, __m256 a)
00823 {
00824   __builtin_ia32_storeups256(p, (__v8sf)a);
00825 }
00826 
/* Stores the 256-bit integer vector a to *p by plain assignment.
 * NOTE(review): presumably requires p to meet __m256i's alignment (32 bytes)
 * - confirm; _mm256_storeu_si256 goes through a dedicated builtin instead. */
00827 static __inline void __attribute__((__always_inline__, __nodebug__))
00828 _mm256_store_si256(__m256i *p, __m256i a)
00829 {
00830   *p = a;
00831 }
00832 
/* Stores a (viewed as 32 chars) to p (as a char pointer) via
 * __builtin_ia32_storedqu256; presumably an unaligned 256-bit integer store
 * - confirm. */
00833 static __inline void __attribute__((__always_inline__, __nodebug__))
00834 _mm256_storeu_si256(__m256i *p, __m256i a)
00835 {
00836   __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
00837 }
00838 
00839 /* Conditional load ops */
/* 128-bit masked load: passes p (as a vector pointer) and the mask m to
 * __builtin_ia32_maskloadpd. NOTE(review): presumably a lane is loaded when
 * the sign bit of the matching lane of m is set and zeroed otherwise -
 * confirm. Here the mask is typed as a floating-point vector. */
00840 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
00841 _mm_maskload_pd(double const *p, __m128d m)
00842 {
00843   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
00844 }
00845 
/* 256-bit masked load: passes p (as a vector pointer) and the mask m to
 * __builtin_ia32_maskloadpd256. NOTE(review): presumably a lane is loaded
 * when the sign bit of the matching lane of m is set and zeroed otherwise -
 * confirm. Here the mask is typed as a floating-point vector. */
00846 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00847 _mm256_maskload_pd(double const *p, __m256d m)
00848 {
00849   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
00850 }
00851 
/* 128-bit masked load: passes p (as a vector pointer) and the mask m to
 * __builtin_ia32_maskloadps. NOTE(review): presumably a lane is loaded when
 * the sign bit of the matching lane of m is set and zeroed otherwise -
 * confirm. Here the mask is typed as a floating-point vector. */
00852 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
00853 _mm_maskload_ps(float const *p, __m128 m)
00854 {
00855   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
00856 }
00857 
/* 256-bit masked load: passes p (as a vector pointer) and the mask m to
 * __builtin_ia32_maskloadps256. NOTE(review): presumably a lane is loaded
 * when the sign bit of the matching lane of m is set and zeroed otherwise -
 * confirm. Here the mask is typed as a floating-point vector. */
00858 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00859 _mm256_maskload_ps(float const *p, __m256 m)
00860 {
00861   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
00862 }
00863 
00864 /* Conditional store ops */
00865 static __inline void __attribute__((__always_inline__, __nodebug__))
00866 _mm256_maskstore_ps(float *p, __m256 m, __m256 a)
00867 {
00868   __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
00869 }
00870 
00871 static __inline void __attribute__((__always_inline__, __nodebug__))
00872 _mm_maskstore_pd(double *p, __m128d m, __m128d a)
00873 {
00874   __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
00875 }
00876 
00877 static __inline void __attribute__((__always_inline__, __nodebug__))
00878 _mm256_maskstore_pd(double *p, __m256d m, __m256d a)
00879 {
00880   __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
00881 }
00882 
00883 static __inline void __attribute__((__always_inline__, __nodebug__))
00884 _mm_maskstore_ps(float *p, __m128 m, __m128 a)
00885 {
00886   __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
00887 }
00888 
00889 /* Cacheability support ops */
00890 static __inline void __attribute__((__always_inline__, __nodebug__))
00891 _mm256_stream_si256(__m256i *a, __m256i b)
00892 {
00893   __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
00894 }
00895 
00896 static __inline void __attribute__((__always_inline__, __nodebug__))
00897 _mm256_stream_pd(double *a, __m256d b)
00898 {
00899   __builtin_ia32_movntpd256(a, (__v4df)b);
00900 }
00901 
00902 static __inline void __attribute__((__always_inline__, __nodebug__))
00903 _mm256_stream_ps(float *p, __m256 a)
00904 {
00905   __builtin_ia32_movntps256(p, (__v8sf)a);
00906 }
00907 
00908 /* Create vectors */
00909 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00910 _mm256_set_pd(double a, double b, double c, double d)
00911 {
00912   return (__m256d){ d, c, b, a };
00913 }
00914 
00915 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00916 _mm256_set_ps(float a, float b, float c, float d,
00917               float e, float f, float g, float h)
00918 {
00919   return (__m256){ h, g, f, e, d, c, b, a };
00920 }
00921 
00922 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00923 _mm256_set_epi32(int i0, int i1, int i2, int i3,
00924                  int i4, int i5, int i6, int i7)
00925 {
00926   return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
00927 }
00928 
00929 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00930 _mm256_set_epi16(short w15, short w14, short w13, short w12,
00931                  short w11, short w10, short w09, short w08,
00932                  short w07, short w06, short w05, short w04,
00933                  short w03, short w02, short w01, short w00)
00934 {
00935   return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
00936                              w08, w09, w10, w11, w12, w13, w14, w15 };
00937 }
00938 
00939 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00940 _mm256_set_epi8(char b31, char b30, char b29, char b28,
00941                 char b27, char b26, char b25, char b24,
00942                 char b23, char b22, char b21, char b20,
00943                 char b19, char b18, char b17, char b16,
00944                 char b15, char b14, char b13, char b12,
00945                 char b11, char b10, char b09, char b08,
00946                 char b07, char b06, char b05, char b04,
00947                 char b03, char b02, char b01, char b00)
00948 {
00949   return (__m256i)(__v32qi){
00950     b00, b01, b02, b03, b04, b05, b06, b07,
00951     b08, b09, b10, b11, b12, b13, b14, b15,
00952     b16, b17, b18, b19, b20, b21, b22, b23,
00953     b24, b25, b26, b27, b28, b29, b30, b31
00954   };
00955 }
00956 
00957 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00958 _mm256_set_epi64x(long long a, long long b, long long c, long long d)
00959 {
00960   return (__m256i)(__v4di){ d, c, b, a };
00961 }
00962 
00963 /* Create vectors with elements in reverse order */
00964 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
00965 _mm256_setr_pd(double a, double b, double c, double d)
00966 {
00967   return (__m256d){ a, b, c, d };
00968 }
00969 
00970 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
00971 _mm256_setr_ps(float a, float b, float c, float d,
00972                float e, float f, float g, float h)
00973 {
00974   return (__m256){ a, b, c, d, e, f, g, h };
00975 }
00976 
00977 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00978 _mm256_setr_epi32(int i0, int i1, int i2, int i3,
00979                   int i4, int i5, int i6, int i7)
00980 {
00981   return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
00982 }
00983 
00984 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00985 _mm256_setr_epi16(short w15, short w14, short w13, short w12,
00986        short w11, short w10, short w09, short w08,
00987        short w07, short w06, short w05, short w04,
00988        short w03, short w02, short w01, short w00)
00989 {
00990   return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
00991                              w07, w06, w05, w04, w03, w02, w01, w00 };
00992 }
00993 
00994 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
00995 _mm256_setr_epi8(char b31, char b30, char b29, char b28,
00996                  char b27, char b26, char b25, char b24,
00997                  char b23, char b22, char b21, char b20,
00998                  char b19, char b18, char b17, char b16,
00999                  char b15, char b14, char b13, char b12,
01000                  char b11, char b10, char b09, char b08,
01001                  char b07, char b06, char b05, char b04,
01002                  char b03, char b02, char b01, char b00)
01003 {
01004   return (__m256i)(__v32qi){
01005     b31, b30, b29, b28, b27, b26, b25, b24,
01006     b23, b22, b21, b20, b19, b18, b17, b16,
01007     b15, b14, b13, b12, b11, b10, b09, b08,
01008     b07, b06, b05, b04, b03, b02, b01, b00 };
01009 }
01010 
01011 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01012 _mm256_setr_epi64x(long long a, long long b, long long c, long long d)
01013 {
01014   return (__m256i)(__v4di){ a, b, c, d };
01015 }
01016 
01017 /* Create vectors with repeated elements */
01018 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
01019 _mm256_set1_pd(double w)
01020 {
01021   return (__m256d){ w, w, w, w };
01022 }
01023 
01024 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
01025 _mm256_set1_ps(float w)
01026 {
01027   return (__m256){ w, w, w, w, w, w, w, w };
01028 }
01029 
01030 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01031 _mm256_set1_epi32(int i)
01032 {
01033   return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
01034 }
01035 
01036 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01037 _mm256_set1_epi16(short w)
01038 {
01039   return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
01040 }
01041 
01042 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01043 _mm256_set1_epi8(char b)
01044 {
01045   return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
01046                              b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
01047 }
01048 
01049 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01050 _mm256_set1_epi64x(long long q)
01051 {
01052   return (__m256i)(__v4di){ q, q, q, q };
01053 }
01054 
01055 /* Create zeroed vectors */
01056 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
01057 _mm256_setzero_pd(void)
01058 {
01059   return (__m256d){ 0, 0, 0, 0 };
01060 }
01061 
01062 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
01063 _mm256_setzero_ps(void)
01064 {
01065   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
01066 }
01067 
01068 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01069 _mm256_setzero_si256(void)
01070 {
01071   return (__m256i){ 0LL, 0LL, 0LL, 0LL };
01072 }
01073 
01074 /* Cast between vector types */
01075 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
01076 _mm256_castpd_ps(__m256d in)
01077 {
01078   return (__m256)in;
01079 }
01080 
01081 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01082 _mm256_castpd_si256(__m256d in)
01083 {
01084   return (__m256i)in;
01085 }
01086 
01087 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
01088 _mm256_castps_pd(__m256 in)
01089 {
01090   return (__m256d)in;
01091 }
01092 
01093 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01094 _mm256_castps_si256(__m256 in)
01095 {
01096   return (__m256i)in;
01097 }
01098 
01099 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
01100 _mm256_castsi256_ps(__m256i in)
01101 {
01102   return (__m256)in;
01103 }
01104 
01105 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
01106 _mm256_castsi256_pd(__m256i in)
01107 {
01108   return (__m256d)in;
01109 }
01110 
01111 static __inline __m128d __attribute__((__always_inline__, __nodebug__))
01112 _mm256_castpd256_pd128(__m256d in)
01113 {
01114   return __builtin_shufflevector(in, in, 0, 1);
01115 }
01116 
01117 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
01118 _mm256_castps256_ps128(__m256 in)
01119 {
01120   return __builtin_shufflevector(in, in, 0, 1, 2, 3);
01121 }
01122 
01123 static __inline __m128i __attribute__((__always_inline__, __nodebug__))
01124 _mm256_castsi256_si128(__m256i in)
01125 {
01126   return __builtin_shufflevector(in, in, 0, 1);
01127 }
01128 
01129 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
01130 _mm256_castpd128_pd256(__m128d in)
01131 {
01132   __m128d zero = _mm_setzero_pd();
01133   return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
01134 }
01135 
01136 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
01137 _mm256_castps128_ps256(__m128 in)
01138 {
01139   __m128 zero = _mm_setzero_ps();
01140   return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
01141 }
01142 
01143 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01144 _mm256_castsi128_si256(__m128i in)
01145 {
01146   __m128i zero = _mm_setzero_si128();
01147   return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
01148 }
01149 
01150 /* SIMD load ops (unaligned) */
01151 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
01152 _mm256_loadu2_m128(float const *addr_hi, float const *addr_lo)
01153 {
01154   struct __loadu_ps {
01155     __m128 v;
01156   } __attribute__((__packed__, __may_alias__));
01157 
01158   __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v);
01159   return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1);
01160 }
01161 
01162 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
01163 _mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo)
01164 {
01165   struct __loadu_pd {
01166     __m128d v;
01167   } __attribute__((__packed__, __may_alias__));
01168   
01169   __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v);
01170   return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1);
01171 }
01172 
01173 static __inline __m256i __attribute__((__always_inline__, __nodebug__))
01174 _mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo)
01175 {
01176   struct __loadu_si128 {
01177     __m128i v;
01178   } __attribute__((packed, may_alias));
01179   __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v);
01180   return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1);
01181 }
01182 
01183 /* SIMD store ops (unaligned) */
01184 static __inline void __attribute__((__always_inline__, __nodebug__))
01185 _mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a)
01186 {
01187   __m128 v128;
01188 
01189   v128 = _mm256_castps256_ps128(a);
01190   __builtin_ia32_storeups(addr_lo, v128);
01191   v128 = _mm256_extractf128_ps(a, 1);
01192   __builtin_ia32_storeups(addr_hi, v128);
01193 }
01194 
01195 static __inline void __attribute__((__always_inline__, __nodebug__))
01196 _mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a)
01197 {
01198   __m128d v128;
01199 
01200   v128 = _mm256_castpd256_pd128(a);
01201   __builtin_ia32_storeupd(addr_lo, v128);
01202   v128 = _mm256_extractf128_pd(a, 1);
01203   __builtin_ia32_storeupd(addr_hi, v128);
01204 }
01205 
01206 static __inline void __attribute__((__always_inline__, __nodebug__))
01207 _mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a)
01208 {
01209   __m128i v128;
01210 
01211   v128 = _mm256_castsi256_si128(a);
01212   __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128);
01213   v128 = _mm256_extractf128_si256(a, 1);
01214   __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128);
01215 }