doxygen/ppc__wrappers_2xmmintrin_8h_source.html

/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===

 *

 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 * See https://llvm.org/LICENSE.txt for license information.

 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 *

 *===-----------------------------------------------------------------------===

 */


/* Implemented from the specification included in the Intel C++ Compiler

   User Guide and Reference, version 9.0.  */


#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since X86 SSE intrinsics mainly handle the __m128 type, PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level. There are differences for data
   format and placement of float scalars in the vector register, which
   require extra steps to match SSE scalar float semantics on POWER.

   It should be noted that there are many differences between X86_64's
   MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
   portable <fenv.h> instead of accessing MXCSR directly.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications. */
/* The message must sit on the line immediately following the backslash so
   that it is spliced into the #error directive.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif


#ifndef XMMINTRIN_H_

#define XMMINTRIN_H_


#if defined(__powerpc64__) &&                                                  \

    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))


/* Build a four-value permute mask: each argument occupies a two-bit field,
   with z in bits 0-1, y in bits 2-3, x in bits 4-5 and w in bits 6-7.  */
#define _MM_SHUFFLE(w, x, y, z) ((z) | ((y) << 2) | ((x) << 4) | ((w) << 6))


#include <altivec.h>


/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards.  This should eventually be done inside altivec.h itself,
   but only after testing a full distro build.

   When compiling in strict ANSI mode as C++ or as C11 (or later), the
   vector, pixel and bool macros are removed.  Each continuation backslash
   must be followed directly by the next part of the condition.  */
#if defined(__STRICT_ANSI__) &&                                                \
    (defined(__cplusplus) ||                                                   \
     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif


/* We need type definitions from the MMX header file.  */

#include <mmintrin.h>


/* Get _mm_malloc () and _mm_free ().  */

#if __STDC_HOSTED__

#include <mm_malloc.h>

#endif


/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  __m128 is therefore a
   vector of float declared with the __may_alias__ attribute.  */

typedef vector float __m128 __attribute__((__may_alias__));


/* Unaligned version of the same type: same element type and aliasing
   attribute, with the alignment lowered to 1 byte via __aligned__(1).  */

typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));


/* Internal data types for implementing the intrinsics.  __v4sf is the
   plain vector-of-float type, declared without any extra attributes.  */

typedef vector float __v4sf;


/* Return a vector whose contents are deliberately left unspecified.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_ps(void) {
  /* Self-initialization: the variable is declared without being given any
     particular contents.  */
  __m128 __undef = __undef;
  return __undef;
}


/* Return a vector in which all four SPFP elements are 0.0f.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_ps(void) {
  __m128 __zero = {0.0f, 0.0f, 0.0f, 0.0f};
  return __zero;
}


/* Load four SPFP values from __P with vec_ld.  The address must be
   16-byte aligned.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ps(float const *__P) {
  __v4sf *__src = (__v4sf *)__P;
  return (__m128)vec_ld(0, __src);
}


/* Load four SPFP values from __P with vec_vsx_ld.  The address need not be
   16-byte aligned.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_ps(float const *__P) {
  __m128 __loaded = vec_vsx_ld(0, __P);
  return __loaded;
}


/* Load four SPFP values from __P and return them in reverse element order.
   The address must be aligned.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_ps(float const *__P) {
  /* Byte selectors that pick the four 32-bit words in reverse order while
     keeping the byte order inside each word.  */
  static const __vector unsigned char __reverse_words = {
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
  __v4sf __loaded = vec_ld(0, (__v4sf *)__P);
  __m128 __reversed = (__m128)vec_perm(__loaded, __loaded, __reverse_words);
  return __reversed;
}


/* Return a vector whose four elements all hold __F.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_ps(float __F) {
  __v4sf __splat = {__F, __F, __F, __F};
  return (__m128)__splat;
}


/* Alternate name for _mm_set1_ps: return a vector whose four elements all
   hold __F.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ps1(float __F) {
  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
}


/* Create the vector [Z Y X W]: the last argument (__W) becomes element 0
   and the first argument (__Z) becomes element 3.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
                                      __artificial__))
_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
  __v4sf __packed = {__W, __X, __Y, __Z};
  return (__m128)__packed;
}


/* Create a vector from the arguments in the order given: the first
   argument (__Z) becomes element 0 and the last (__W) becomes element 3.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
  __v4sf __packed = {__Z, __Y, __X, __W};
  return (__m128)__packed;
}


/* Store the four SPFP values of __A to __P with vec_st.  The address must
   be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ps(float *__P, __m128 __A) {
  __v4sf *__dst = (__v4sf *)__P;
  vec_st((__v4sf)__A, 0, __dst);
}


/* Store the four SPFP values of __A to __P.  The address need not be
   16-byte aligned: the store goes through the 1-byte-aligned __m128_u
   type.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_ps(float *__P, __m128 __A) {
  __m128_u *__dst = (__m128_u *)__P;
  *__dst = __A;
}


/* Store the four SPFP values of __A to __P in reverse element order.  The
   address must be aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_ps(float *__P, __m128 __A) {
  /* Byte selectors that pick the four 32-bit words in reverse order while
     keeping the byte order inside each word.  */
  static const __vector unsigned char __reverse_words = {
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
  __v4sf __reversed = (__m128)vec_perm(__A, __A, __reverse_words);
  _mm_store_ps(__P, __reversed);
}


/* Replicate element 0 of __A into all four words and store the result to
   __P through _mm_store_ps.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_ps(float *__P, __m128 __A) {
  _mm_store_ps(__P, vec_splat((__v4sf)__A, 0));
}


/* Alternate name for _mm_store1_ps: replicate element 0 of __A into all
   four words and store the result to __P.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ps1(float *__P, __m128 __A) {
  __v4sf __splat = vec_splat((__v4sf)__A, 0);
  _mm_store_ps(__P, __splat);
}


/* Return a vector with element 0 set to __F and elements 1-3 set to
   0.0f.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ss(float __F) {
  __v4sf __scalar = {__F, 0.0f, 0.0f, 0.0f};
  return (__m128)__scalar;
}


/* Return __A with its element 0 replaced by element 0 of __B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_ss(__m128 __A, __m128 __B) {
  /* All-ones in element 0 only: vec_sel takes that element from its second
     operand and the remaining elements from its first.  */
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  __v4sf __upper = (__v4sf)__A;
  __v4sf __lower = (__v4sf)__B;
  return vec_sel(__upper, __lower, __lane0);
}


/* Read one float from __P and return a vector with that value in element 0
   and zeros elsewhere (via _mm_set_ss).  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ss(float const *__P) {
  float __scalar = *__P;
  return _mm_set_ss(__scalar);
}


/* Write element 0 of __A to *__P.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ss(float *__P, __m128 __A) {
  __v4sf __src = (__v4sf)__A;
  *__P = __src[0];
}


/* Perform the respective operation on the lower SPFP (single-precision

   floating-point) values of A and B; the upper three SPFP values are

   passed through from A.  */


/* Add element 0 of __B to element 0 of __A; elements 1-3 of the result are
   copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
     result.  To ensure no spurious exceptions are generated from the
     upper float values, element 0 of each input is splatted across the
     vector before we do the operation.  */
  __m128 __lhs = vec_splat(__A, 0);
  __m128 __rhs = vec_splat(__B, 0);
  __m128 __sum = __lhs + __rhs;
  /* Merge element 0 of the result with the original upper float elements
     from __A.  */
  return vec_sel(__A, __sum, __lane0);
#else
  __A[0] += __B[0];
  return __A;
#endif
}


/* Subtract element 0 of __B from element 0 of __A; elements 1-3 of the
   result are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
     result.  To ensure no spurious exceptions are generated from the
     upper float values, element 0 of each input is splatted across the
     vector before we do the operation.  */
  __m128 __lhs = vec_splat(__A, 0);
  __m128 __rhs = vec_splat(__B, 0);
  __m128 __diff = __lhs - __rhs;
  /* Merge element 0 of the result with the original upper float elements
     from __A.  */
  return vec_sel(__A, __diff, __lane0);
#else
  __A[0] -= __B[0];
  return __A;
#endif
}


/* Multiply element 0 of __A by element 0 of __B; elements 1-3 of the
   result are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
     result.  To ensure no spurious exceptions are generated from the
     upper float values, element 0 of each input is splatted across the
     vector before we do the operation.  */
  __m128 __lhs = vec_splat(__A, 0);
  __m128 __rhs = vec_splat(__B, 0);
  __m128 __prod = __lhs * __rhs;
  /* Merge element 0 of the result with the original upper float elements
     from __A.  */
  return vec_sel(__A, __prod, __lane0);
#else
  __A[0] *= __B[0];
  return __A;
#endif
}


/* Divide element 0 of __A by element 0 of __B; elements 1-3 of the result
   are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
     result.  To ensure no spurious exceptions are generated from the
     upper float values, element 0 of each input is splatted across the
     vector before we do the operation.  */
  __m128 __lhs = vec_splat(__A, 0);
  __m128 __rhs = vec_splat(__B, 0);
  __m128 __quot = __lhs / __rhs;
  /* Merge element 0 of the result with the original upper float elements
     from __A.  */
  return vec_sel(__A, __quot, __lane0);
#else
  __A[0] /= __B[0];
  return __A;
#endif
}


/* Replace element 0 of __A with its square root; elements 1-3 of the
   result are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_ss(__m128 __A) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
   * result.  To ensure no spurious exceptions are generated from the
   * upper float values, element 0 is splatted across the vector before
   * we do the operation. */
  __m128 __bcast = vec_splat(__A, 0);
  __m128 __root = vec_sqrt(__bcast);
  /* Merge element 0 of the result with the original upper float elements
   * from __A.  */
  return vec_sel(__A, __root, __lane0);
}


/* Perform the respective operation on the four SPFP values in A and B.  */

/* Element-wise sum of the four SPFP values in __A and __B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)(__lhs + __rhs);
}


/* Element-wise difference (__A - __B) of the four SPFP values.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)(__lhs - __rhs);
}


/* Element-wise product of the four SPFP values in __A and __B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)(__lhs * __rhs);
}


/* Element-wise quotient (__A / __B) of the four SPFP values.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)(__lhs / __rhs);
}


/* Element-wise square root of the four SPFP values in __A (vec_sqrt).  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_ps(__m128 __A) {
  __m128 __root = vec_sqrt((__v4sf)__A);
  return __root;
}


/* Element-wise reciprocal estimate of the four SPFP values in __A
   (vec_re).  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rcp_ps(__m128 __A) {
  __m128 __recip = vec_re((__v4sf)__A);
  return __recip;
}


/* Element-wise reciprocal square root estimate of the four SPFP values in
   __A (vec_rsqrte).  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rsqrt_ps(__m128 __A) {
  __m128 __estimate = vec_rsqrte(__A);
  return __estimate;
}


/* Replace element 0 of __A with its reciprocal estimate; elements 1-3 of
   the result are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rcp_ss(__m128 __A) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
   * result.  To ensure no spurious exceptions are generated from the
   * upper float values, element 0 is splatted across the vector before
   * we do the operation. */
  __m128 __bcast = vec_splat(__A, 0);
  /* Same estimate that _mm_rcp_ps computes.  */
  __m128 __recip = vec_re((__v4sf)__bcast);
  /* Merge element 0 of the result with the original upper float elements
   * from __A.  */
  return vec_sel(__A, __recip, __lane0);
}


/* Replace element 0 of __A with its reciprocal square root estimate;
   elements 1-3 of the result are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rsqrt_ss(__m128 __A) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
   * result.  To ensure no spurious exceptions are generated from the
   * upper float values, element 0 is splatted across the vector before
   * we do the operation. */
  __m128 __bcast = vec_splat(__A, 0);
  __m128 __estimate = vec_rsqrte(__bcast);
  /* Merge element 0 of the result with the original upper float elements
   * from __A.  */
  return vec_sel(__A, __estimate, __lane0);
}


/* Replace element 0 of __A with vec_min of the element-0 values of __A and
   __B; elements 1-3 of the result are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
   * result.  To ensure no spurious exceptions are generated from the
   * upper float values, element 0 of each input is splatted across the
   * vector before we do the operation. */
  __v4sf __lhs = vec_splat((__v4sf)__A, 0);
  __v4sf __rhs = vec_splat((__v4sf)__B, 0);
  __v4sf __least = vec_min(__lhs, __rhs);
  /* Merge element 0 of the result with the original upper float elements
   * from __A.  */
  return vec_sel((__v4sf)__A, __least, __lane0);
}


/* Replace element 0 of __A with vec_max of the element-0 values of __A and
   __B; elements 1-3 of the result are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector operation cannot produce a partial (element 0 only)
   * result.  To ensure no spurious exceptions are generated from the
   * upper float values, element 0 of each input is splatted across the
   * vector before we do the operation. */
  __v4sf __lhs = vec_splat(__A, 0);
  __v4sf __rhs = vec_splat(__B, 0);
  __v4sf __greatest = vec_max(__lhs, __rhs);
  /* Merge element 0 of the result with the original upper float elements
   * from __A.  */
  return vec_sel((__v4sf)__A, __greatest, __lane0);
}


/* Element-wise minimum: each result element is taken from __A where
   __B > __A compares true, and from __B otherwise.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_ps(__m128 __A, __m128 __B) {
  __vector __bool int __take_a = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
  return vec_sel(__B, __A, __take_a);
}


/* Element-wise maximum: each result element is taken from __A where
   __A > __B compares true, and from __B otherwise.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_ps(__m128 __A, __m128 __B) {
  __vector __bool int __take_a = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
  return vec_sel(__B, __A, __take_a);
}


/* Perform logical bit-wise operations on 128-bit values.  */

/* Bit-wise AND of the 128-bit values __A and __B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_and(__lhs, __rhs);
}


/* Bit-wise AND of __B with the complement of __A.  Note the swapped
   operand order in the vec_andc call.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_ps(__m128 __A, __m128 __B) {
  __v4sf __kept = (__v4sf)__B;
  __v4sf __cleared = (__v4sf)__A;
  return (__m128)vec_andc(__kept, __cleared);
}


/* Bit-wise OR of the 128-bit values __A and __B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_or(__lhs, __rhs);
}


/* Bit-wise exclusive OR of the 128-bit values __A and __B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_xor(__lhs, __rhs);
}


/* Perform a comparison on the four SPFP values of A and B.  For each

   element, if the comparison is true, place a mask of all ones in the

   result, otherwise a mask of zeros.  */

/* Element-wise __A == __B; the vec_cmpeq mask is returned reinterpreted as
   __m128.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_cmpeq(__lhs, __rhs);
}


/* Element-wise __A < __B; the vec_cmplt mask is returned reinterpreted as
   __m128.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_cmplt(__lhs, __rhs);
}


/* Element-wise __A <= __B; the vec_cmple mask is returned reinterpreted as
   __m128.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_cmple(__lhs, __rhs);
}


/* Element-wise __A > __B; the vec_cmpgt mask is returned reinterpreted as
   __m128.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_cmpgt(__lhs, __rhs);
}


/* Element-wise __A >= __B; the vec_cmpge mask is returned reinterpreted as
   __m128.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_ps(__m128 __A, __m128 __B) {
  __v4sf __lhs = (__v4sf)__A;
  __v4sf __rhs = (__v4sf)__B;
  return (__m128)vec_cmpge(__lhs, __rhs);
}


/* Element-wise __A != __B, computed as the bit-wise complement of the
   vec_cmpeq mask.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_ps(__m128 __A, __m128 __B) {
  __v4sf __equal = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
  /* vec_nor of a value with itself is its complement.  */
  __m128 __not_equal = (__m128)vec_nor(__equal, __equal);
  return __not_equal;
}


/* Element-wise "not less than": each result element is all ones when
   !(__A < __B) and all zeros otherwise.

   The result is the bit-wise complement of the vec_cmplt mask rather than
   vec_cmpge, so that an element holding a NaN in either operand (for which
   both "<" and ">=" are false) yields all ones, as the SSE predicate
   requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
  __v4sf __temp = (__v4sf)vec_cmplt((__v4sf)__A, (__v4sf)__B);
  /* vec_nor of a value with itself is its complement.  */
  return ((__m128)vec_nor(__temp, __temp));
}


/* Element-wise "not less than or equal": each result element is all ones
   when !(__A <= __B) and all zeros otherwise.

   The result is the bit-wise complement of the vec_cmple mask rather than
   vec_cmpgt, so that an element holding a NaN in either operand (for which
   both "<=" and ">" are false) yields all ones, as the SSE predicate
   requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_ps(__m128 __A, __m128 __B) {
  __v4sf __temp = (__v4sf)vec_cmple((__v4sf)__A, (__v4sf)__B);
  /* vec_nor of a value with itself is its complement.  */
  return ((__m128)vec_nor(__temp, __temp));
}


/* Element-wise "not greater than": each result element is all ones when
   !(__A > __B) and all zeros otherwise.

   The result is the bit-wise complement of the vec_cmpgt mask rather than
   vec_cmple, so that an element holding a NaN in either operand (for which
   both ">" and "<=" are false) yields all ones, as the SSE predicate
   requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_ps(__m128 __A, __m128 __B) {
  __v4sf __temp = (__v4sf)vec_cmpgt((__v4sf)__A, (__v4sf)__B);
  /* vec_nor of a value with itself is its complement.  */
  return ((__m128)vec_nor(__temp, __temp));
}


/* Element-wise "not greater than or equal": each result element is all
   ones when !(__A >= __B) and all zeros otherwise.

   The result is the bit-wise complement of the vec_cmpge mask rather than
   vec_cmplt, so that an element holding a NaN in either operand (for which
   both ">=" and "<" are false) yields all ones, as the SSE predicate
   requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_ps(__m128 __A, __m128 __B) {
  __v4sf __temp = (__v4sf)vec_cmpge((__v4sf)__A, (__v4sf)__B);
  /* vec_nor of a value with itself is its complement.  */
  return ((__m128)vec_nor(__temp, __temp));
}


/* Element-wise "ordered" test: each result element is all ones when
   neither __A nor __B holds a NaN in that element, all zeros otherwise.

   The test is done on the integer bit patterns of |__A| and |__B|.  A
   value is a NaN exactly when its absolute-value bits are greater than
   0x7f800000 (all exponent bits set, fraction zero, i.e. infinity).  The
   result is the complement of "either operand is NaN"; testing
   "0x7f800000 > bits" instead would wrongly report infinities, whose bits
   are equal to the mask, as unordered.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_ps(__m128 __A, __m128 __B) {
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};

  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
  /* __c / __d: all ones where the corresponding operand is a NaN.  */
  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
  /* Ordered means neither operand is a NaN.  */
  return ((__m128)vec_nor(__c, __d));
}


/* Element-wise "unordered" test: each result element is all ones when the
   absolute-value bit pattern of __A or of __B is greater than 0x7f800000
   in that element, all zeros otherwise.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_ps(__m128 __A, __m128 __B) {
  static const __vector unsigned int __exp_bits = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
  /* Work on the integer bit patterns of |__A| and |__B|.  */
  __vector unsigned int __abs_a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __vector unsigned int __abs_b = (__vector unsigned int)vec_abs((__v4sf)__B);
  __vector unsigned int __nan_a =
      (__vector unsigned int)vec_cmpgt(__abs_a, __exp_bits);
  __vector unsigned int __nan_b =
      (__vector unsigned int)vec_cmpgt(__abs_b, __exp_bits);
  return (__m128)vec_or(__nan_a, __nan_b);
}


/* Perform a comparison on the lower SPFP values of A and B.  If the

   comparison is true, place a mask of all ones in the result, otherwise a

   mask of zeros.  The upper three SPFP values are passed through from A.  */

/* Compare element 0 of __A and __B for equality.  Element 0 of the result
   is the comparison mask; elements 1-3 are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector compare cannot produce a partial (element 0 only) result.
   * To ensure no spurious exceptions are generated from the upper
   * elements, element 0 of each input is splatted across the vector
   * before we do the operation. */
  __v4sf __lhs = vec_splat((__v4sf)__A, 0);
  __v4sf __rhs = vec_splat((__v4sf)__B, 0);
  __v4sf __cmp = (__v4sf)vec_cmpeq(__lhs, __rhs);
  /* Merge element 0 of the mask with the original upper float elements
   * from __A.  */
  return (__m128)vec_sel((__v4sf)__A, __cmp, __lane0);
}


/* Compare element 0 of __A and __B for __A < __B.  Element 0 of the result
   is the comparison mask; elements 1-3 are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector compare cannot produce a partial (element 0 only) result.
   * To ensure no spurious exceptions are generated from the upper
   * elements, element 0 of each input is splatted across the vector
   * before we do the operation. */
  __v4sf __lhs = vec_splat((__v4sf)__A, 0);
  __v4sf __rhs = vec_splat((__v4sf)__B, 0);
  __v4sf __cmp = (__v4sf)vec_cmplt(__lhs, __rhs);
  /* Merge element 0 of the mask with the original upper float elements
   * from __A.  */
  return (__m128)vec_sel((__v4sf)__A, __cmp, __lane0);
}


/* Compare element 0 of __A and __B for __A <= __B.  Element 0 of the
   result is the comparison mask; elements 1-3 are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector compare cannot produce a partial (element 0 only) result.
   * To ensure no spurious exceptions are generated from the upper
   * elements, element 0 of each input is splatted across the vector
   * before we do the operation. */
  __v4sf __lhs = vec_splat((__v4sf)__A, 0);
  __v4sf __rhs = vec_splat((__v4sf)__B, 0);
  __v4sf __cmp = (__v4sf)vec_cmple(__lhs, __rhs);
  /* Merge element 0 of the mask with the original upper float elements
   * from __A.  */
  return (__m128)vec_sel((__v4sf)__A, __cmp, __lane0);
}


/* Compare element 0 of __A and __B for __A > __B.  Element 0 of the result
   is the comparison mask; elements 1-3 are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector compare cannot produce a partial (element 0 only) result.
   * To ensure no spurious exceptions are generated from the upper
   * elements, element 0 of each input is splatted across the vector
   * before we do the operation. */
  __v4sf __lhs = vec_splat((__v4sf)__A, 0);
  __v4sf __rhs = vec_splat((__v4sf)__B, 0);
  __v4sf __cmp = (__v4sf)vec_cmpgt(__lhs, __rhs);
  /* Merge element 0 of the mask with the original upper float elements
   * from __A.  */
  return (__m128)vec_sel((__v4sf)__A, __cmp, __lane0);
}


/* Compare element 0 of __A and __B for __A >= __B.  Element 0 of the
   result is the comparison mask; elements 1-3 are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector compare cannot produce a partial (element 0 only) result.
   * To ensure no spurious exceptions are generated from the upper
   * elements, element 0 of each input is splatted across the vector
   * before we do the operation. */
  __v4sf __lhs = vec_splat((__v4sf)__A, 0);
  __v4sf __rhs = vec_splat((__v4sf)__B, 0);
  __v4sf __cmp = (__v4sf)vec_cmpge(__lhs, __rhs);
  /* Merge element 0 of the mask with the original upper float elements
   * from __A.  */
  return (__m128)vec_sel((__v4sf)__A, __cmp, __lane0);
}


/* Compare element 0 of __A and __B for inequality, computed as the
   complement of the equality mask.  Element 0 of the result is the
   comparison mask; elements 1-3 are copied from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* The vector compare cannot produce a partial (element 0 only) result.
   * To ensure no spurious exceptions are generated from the upper
   * elements, element 0 of each input is splatted across the vector
   * before we do the operation. */
  __v4sf __lhs = vec_splat((__v4sf)__A, 0);
  __v4sf __rhs = vec_splat((__v4sf)__B, 0);
  __v4sf __equal = (__v4sf)vec_cmpeq(__lhs, __rhs);
  /* vec_nor of a value with itself is its complement.  */
  __v4sf __cmp = vec_nor(__equal, __equal);
  /* Merge element 0 of the mask with the original upper float elements
   * from __A.  */
  return (__m128)vec_sel((__v4sf)__A, __cmp, __lane0);
}


/* Compare element 0 of __A and __B for "not less than", i.e.
   !(__A < __B).  Element 0 of the result is the comparison mask; elements
   1-3 are copied from __A.

   The mask is the complement of the vec_cmplt mask rather than vec_cmpge,
   so that a NaN in either operand (for which both "<" and ">=" are false)
   yields all ones, as the SSE predicate requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmplt(__a, __b);
  /* vec_nor of a value with itself is its complement.  */
  __c = vec_nor(__c, __c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}


/* Compare element 0 of __A and __B for "not less than or equal", i.e.
   !(__A <= __B).  Element 0 of the result is the comparison mask;
   elements 1-3 are copied from __A.

   The mask is the complement of the vec_cmple mask rather than vec_cmpgt,
   so that a NaN in either operand (for which both "<=" and ">" are false)
   yields all ones, as the SSE predicate requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmple(__a, __b);
  /* vec_nor of a value with itself is its complement.  */
  __c = vec_nor(__c, __c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}


/* Compare element 0 of __A and __B for "not greater than", i.e.
   !(__A > __B).  Element 0 of the result is the comparison mask; elements
   1-3 are copied from __A.

   The mask is the complement of the vec_cmpgt mask rather than vec_cmple,
   so that a NaN in either operand (for which both ">" and "<=" are false)
   yields all ones, as the SSE predicate requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpgt(__a, __b);
  /* vec_nor of a value with itself is its complement.  */
  __c = vec_nor(__c, __c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}


/* Compare element 0 of __A and __B for "not greater than or equal", i.e.
   !(__A >= __B).  Element 0 of the result is the comparison mask;
   elements 1-3 are copied from __A.

   The mask is the complement of the vec_cmpge mask rather than vec_cmplt,
   so that a NaN in either operand (for which both ">=" and "<" are false)
   yields all ones, as the SSE predicate requires.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpge(__a, __b);
  /* vec_nor of a value with itself is its complement.  */
  __c = vec_nor(__c, __c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}


/* "Ordered" test on element 0: element 0 of the result is all ones when
   neither __A nor __B holds a NaN in element 0, all zeros otherwise.
   Elements 1-3 of the result are copied from __A.

   The test is done on the integer bit patterns of |__A| and |__B|.  A
   value is a NaN exactly when its absolute-value bits are greater than
   0x7f800000 (the bit pattern of infinity).  The mask is the complement of
   "either operand is NaN"; testing "0x7f800000 > bits" instead would
   wrongly report infinities, whose bits are equal to the mask, as
   unordered.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_ss(__m128 __A, __m128 __B) {
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};

  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
  /* __c / __d: all ones where the corresponding operand is a NaN.  */
  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
  /* Ordered means neither operand is a NaN.  */
  __c = vec_nor(__c, __d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A.  */
  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
}


/* "Unordered" test on element 0: element 0 of the result is all ones when
   the absolute-value bit pattern of __A or of __B is greater than
   0x7f800000, all zeros otherwise.  Elements 1-3 of the result are copied
   from __A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __exp_bits = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
  static const __vector unsigned int __lane0 = {0xffffffff, 0, 0, 0};
  /* Work on the integer bit patterns of |__A| and |__B|.  */
  __vector unsigned int __abs_a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __vector unsigned int __abs_b = (__vector unsigned int)vec_abs((__v4sf)__B);
  __vector unsigned int __nan_a =
      (__vector unsigned int)vec_cmpgt(__abs_a, __exp_bits);
  __vector unsigned int __nan_b =
      (__vector unsigned int)vec_cmpgt(__abs_b, __exp_bits);
  __vector unsigned int __unordered = vec_or(__nan_a, __nan_b);
  /* Merge element 0 of the mask with the original upper float elements
   * from __A.  */
  return (__m128)vec_sel((__v4sf)__A, (__v4sf)__unordered, __lane0);
}


/* Compare the lower SPFP values of A and B and return 1 if true

   and 0 if false.  */

/* Return 1 if element 0 of __A equals element 0 of __B, else 0.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs == __rhs;
}


/* Return 1 if element 0 of __A is less than element 0 of __B, else 0.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs < __rhs;
}


/* Return 1 if element 0 of __A is less than or equal to element 0 of __B,
   else 0.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs <= __rhs;
}


/* Return 1 if element 0 of __A is greater than element 0 of __B, else
   0.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs > __rhs;
}


/* Return 1 if element 0 of __A is greater than or equal to element 0 of
   __B, else 0.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs >= __rhs;
}


/* Return 1 if element 0 of __A differs from element 0 of __B, else 0.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs != __rhs;
}


/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */

/* Return 1 if element 0 of __A equals element 0 of __B, else 0.  Same
   implementation as _mm_comieq_ss.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs == __rhs;
}


/* Return 1 if element 0 of __A is less than element 0 of __B, else 0.
   Same implementation as _mm_comilt_ss.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs < __rhs;
}


/* Return 1 if element 0 of __A is less than or equal to element 0 of __B,
   else 0.  Same implementation as _mm_comile_ss.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs <= __rhs;
}


/* Return 1 if element 0 of __A is greater than element 0 of __B, else 0.
   Same implementation as _mm_comigt_ss.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs > __rhs;
}


/* Return 1 if element 0 of __A is greater than or equal to element 0 of
   __B, else 0.  Same implementation as _mm_comige_ss.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs >= __rhs;
}


/* Return 1 if element 0 of __A differs from element 0 of __B, else 0.
   Same implementation as _mm_comineq_ss.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_ss(__m128 __A, __m128 __B) {
  float __lhs = __A[0];
  float __rhs = __B[0];
  return __lhs != __rhs;
}


/* Return element 0 of __A as a scalar float.  */
extern __inline float
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_f32(__m128 __A) {
  __v4sf __src = (__v4sf)__A;
  return __src[0];
}


/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.

   __A: vector whose element 0 is converted.
   Returns the converted value as an int.  */

extern __inline int

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_cvtss_si32(__m128 __A) {

  int __res;

#ifdef _ARCH_PWR8

  /* Scratch floating-point operand for the asm sequence below.  */
  double __dtmp;

  /* Instruction sequence: on little-endian only, xxsldwi first rotates __A
     in place; xscvspdp then converts a single-precision value from __A to
     double precision in __dtmp; fctiw converts __dtmp to an integer word;
     mfvsrd copies the register holding __dtmp into the general-purpose
     register bound to __res.  __A is a read-write operand ("+wa") because
     the rotate overwrites it.  */
  __asm__(

#ifdef __LITTLE_ENDIAN__

      "xxsldwi %x0,%x0,%x0,3;\n"

#endif

      "xscvspdp %x2,%x0;\n"

      "fctiw  %2,%2;\n"

      "mfvsrd  %1,%x2;\n"

      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)

      :);

#else

  /* Fallback when _ARCH_PWR8 is not defined: round element 0 with
     __builtin_rint; the assignment converts the result to int.  */
  __res = __builtin_rint(__A[0]);

#endif

  return __res;

}


/* Alternate name for _mm_cvtss_si32; forwards A unchanged.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_ss2si(__m128 __A) {
  const int __converted = _mm_cvtss_si32(__A);

  return __converted;
}


/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.

   __A supplies the value in element 0; the result is returned as a
   long long.  When both _ARCH_PWR8 and __powerpc64__ are defined the
   conversion is done with inline assembly, otherwise element 0 is passed
   through __builtin_llrint.  */


/* Intel intrinsic.  */

extern __inline long long

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_cvtss_si64(__m128 __A) {

  long long __res;

#if defined(_ARCH_PWR8) && defined(__powerpc64__)

  /* Scratch FP register operand (%2) used between the asm instructions.  */

  double __dtmp;

  /* Instruction sequence, by Power ISA mnemonic:
       xxsldwi   - (little-endian only) rotate the words of __A in place;
       xscvspdp  - convert the single-precision word to double in __dtmp;
       fctid     - convert __dtmp to an integer doubleword;
       mfvsrd    - copy the register holding __dtmp to the GPR for __res.
     __A is a read-write operand ("+wa") because xxsldwi overwrites it.  */

  __asm__(

#ifdef __LITTLE_ENDIAN__

      "xxsldwi %x0,%x0,%x0,3;\n"

#endif

      "xscvspdp %x2,%x0;\n"

      "fctid  %2,%2;\n"

      "mfvsrd  %1,%x2;\n"

      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)

      :);

#else

  /* Portable path.  */

  __res = __builtin_llrint(__A[0]);

#endif

  return __res;

}


/* Microsoft intrinsic.  Same conversion as _mm_cvtss_si64, to which it
   forwards A.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si64x(__m128 __A) {
  const long long __converted = _mm_cvtss_si64((__v4sf)__A);

  return __converted;
}


/* Hint selectors accepted by _mm_prefetch, listed in ascending numeric
   order.  Each _MM_HINT_ETn value equals the matching _MM_HINT_Tn value
   with bit 2 (value 4) also set.  */
enum _mm_hint {
  _MM_HINT_NTA = 0,
  _MM_HINT_T2 = 1,
  _MM_HINT_T1 = 2,
  _MM_HINT_T0 = 3,
  _MM_HINT_ET1 = 6,
  _MM_HINT_ET0 = 7
};


/* Requests a prefetch of the cache line at address P, bringing it "closer"
   to the processor.  The selector I names the type of prefetch requested,
   but this implementation does not forward it: only P is handed to
   __builtin_prefetch.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_prefetch(const void *__P, enum _mm_hint __I) {
  /* The hint parameter is accepted for API compatibility and ignored.  */
  (void)__I;

  __builtin_prefetch(__P);
}


/* Convert the two lower SPFP values of A to 32-bit integers, rounding with
   vec_rint first, and return the two integers packed in an __m64.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi32(__m128 __A) {
  /* Replicate doubleword 0 (the two lower SPFP values) into both halves.  */
  const __v4sf __low_pair = (__v4sf)vec_splat((__vector long long)__A, 0);
  /* Round to integral values, then convert to signed words.  */
  const __v4sf __integral = vec_rint(__low_pair);
  const __vector unsigned long long __packed =
      (__vector unsigned long long)vec_cts(__integral, 0);

  return (__m64)((__vector long long)__packed)[0];
}


/* Alternate name for _mm_cvtps_pi32; forwards A unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_ps2pi(__m128 __A) {
  const __m64 __converted = _mm_cvtps_pi32(__A);

  return __converted;
}


/* Convert element 0 of A to a 32-bit integer using the C float-to-int
   conversion, which truncates toward zero.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si32(__m128 __A) {
  return (int)__A[0];
}


/* Alternate name for _mm_cvttss_si32; forwards A unchanged.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtt_ss2si(__m128 __A) {
  const int __truncated = _mm_cvttss_si32(__A);

  return __truncated;
}


/* Intel intrinsic.  Convert element 0 of A to a 64-bit integer using the C
   float-to-integer conversion, which truncates toward zero.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64(__m128 __A) {
  return (long long)__A[0];
}


/* Microsoft intrinsic.  Convert element 0 of A to a 64-bit integer using
   the C float-to-integer conversion, which truncates toward zero.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64x(__m128 __A) {
  return (long long)__A[0];
}


/* Truncate the two lower SPFP values of A to 32-bit integers and return
   them packed in an __m64.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_pi32(__m128 __A) {
  /* Replicate doubleword 0 (the two lower SPFP values) into both halves.  */
  const __v4sf __low_pair = (__v4sf)vec_splat((__vector long long)__A, 0);
  const __vector unsigned long long __packed =
      (__vector unsigned long long)vec_cts(__low_pair, 0);

  return (__m64)((__vector long long)__packed)[0];
}


/* Alternate name for _mm_cvttps_pi32; forwards A unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtt_ps2pi(__m128 __A) {
  const __m64 __truncated = _mm_cvttps_pi32(__A);

  return __truncated;
}


/* Return A with element 0 replaced by B converted to float; the other
   elements are passed through.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_ss(__m128 __A, int __B) {
  __A[0] = (float)__B;

  return __A;
}


/* Alternate name for _mm_cvtsi32_ss; forwards A and B unchanged.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_si2ss(__m128 __A, int __B) {
  const __m128 __updated = _mm_cvtsi32_ss(__A, __B);

  return __updated;
}


/* Intel intrinsic.  Return A with element 0 replaced by the 64-bit integer
   B converted to float; the other elements are passed through.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_ss(__m128 __A, long long __B) {
  __A[0] = (float)__B;

  return __A;
}


/* Microsoft intrinsic.  Same operation as _mm_cvtsi64_ss, to which it
   forwards A and B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_ss(__m128 __A, long long __B) {
  const __m128 __updated = _mm_cvtsi64_ss(__A, __B);

  return __updated;
}


/* Convert the two 32-bit integers held in B to SPFP and place them in
   doubleword 0 of the result; doubleword 1 is copied from A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
  /* B is duplicated into both doublewords before the conversion.  */
  const __vector signed int __ints =
      (__vector signed int)(__vector unsigned long long){__B, __B};
  const __vector float __floats = (__vector float)vec_ctf(__ints, 0);
  const __vector unsigned long long __converted =
      (__vector unsigned long long)__floats;
  const __vector unsigned long long __passthru =
      (__vector unsigned long long)__A;

  return (__m128)(__vector unsigned long long){__converted[0], __passthru[1]};
}


/* Alternate name for _mm_cvtpi32_ps; forwards A and B unchanged.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
  const __m128 __updated = _mm_cvtpi32_ps(__A, __B);

  return __updated;
}


/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi16_ps(__m64 __A) {
  /* A is duplicated into both doublewords, unpacked to signed words with
     vec_vupklsh, then converted to float.  */
  const __vector signed short __halves =
      (__vector signed short)(__vector unsigned long long){__A, __A};
  const __vector signed int __words = vec_vupklsh(__halves);
  const __vector float __floats = (__vector float)vec_ctf(__words, 0);

  return (__m128)__floats;
}


/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpu16_ps(__m64 __A) {
  const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
  const __vector unsigned short __halves =
      (__vector unsigned short)(__vector unsigned long long){__A, __A};
  __vector unsigned int __words;
  __vector float __floats;

  /* Widen each halfword by merging it with zero; the operand order is
     swapped between endiannesses.  */
#ifdef __LITTLE_ENDIAN__
  __words = (__vector unsigned int)vec_mergel(__halves, __zero);
#else
  __words = (__vector unsigned int)vec_mergel(__zero, __halves);
#endif
  __floats = (__vector float)vec_ctf(__words, 0);

  return (__m128)__floats;
}


/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi8_ps(__m64 __A) {
  /* A is duplicated into both doublewords, then unpacked twice (bytes to
     halfwords, halfwords to words) before the float conversion.  */
  const __vector signed char __bytes =
      (__vector signed char)(__vector unsigned long long){__A, __A};
  const __vector signed short __halves = vec_vupkhsb(__bytes);
  const __vector signed int __words = vec_vupkhsh(__halves);
  const __vector float __floats = (__vector float)vec_ctf(__words, 0);

  return (__m128)__floats;
}


/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpu8_ps(__m64 __A) {
  const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0,
                                         0, 0, 0, 0, 0, 0, 0, 0};
  const __vector unsigned char __bytes =
      (__vector unsigned char)(__vector unsigned long long){__A, __A};
  __vector unsigned short __halves;
  __vector unsigned int __words;
  __vector float __floats;

  /* Widen in two steps by merging with zero; the operand order of each
     merge is swapped between endiannesses.  */
#ifdef __LITTLE_ENDIAN__
  __halves = (__vector unsigned short)vec_mergel(__bytes, __zero);
  __words = (__vector unsigned int)vec_mergeh(
      __halves, (__vector unsigned short)__zero);
#else
  __halves = (__vector unsigned short)vec_mergel(__zero, __bytes);
  __words = (__vector unsigned int)vec_mergeh(
      (__vector unsigned short)__zero, __halves);
#endif
  __floats = (__vector float)vec_ctf(__words, 0);

  return (__m128)__floats;
}


/* Convert the four signed 32-bit values held in A (doubleword 0) and B
   (doubleword 1) to SPFP form.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
  const __vector signed int __ints =
      (__vector signed int)(__vector unsigned long long){__A, __B};
  const __vector float __floats = (__vector float)vec_ctf(__ints, 0);

  return (__m128)__floats;
}


/* Convert the four SPFP values in A to four signed 16-bit integers,
   rounding with vec_rint before the integer conversion.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi16(__m128 __A) {
  const __v4sf __integral = vec_rint(__A);
  const __vector signed int __words = vec_cts(__integral, 0);
  /* Pack the words with themselves so doubleword 0 holds all four
     halfwords.  */
  const __vector unsigned long long __packed =
      (__vector unsigned long long)vec_pack(__words, __words);

  return (__m64)((__vector long long)__packed)[0];
}


/* Convert the four SPFP values in A to four signed 8-bit integers,
   rounding with vec_rint before the integer conversion.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi8(__m128 __A) {
  static const __vector signed int __zero = {0, 0, 0, 0};
  const __v4sf __integral = vec_rint(__A);
  const __vector signed int __words = vec_cts(__integral, 0);
  /* First pack pairs the converted words with zero; the second pack
     narrows the halfwords to bytes.  */
  const __vector signed short __halves = vec_pack(__words, __zero);
  const __vector signed char __bytes = vec_pack(__halves, __halves);

  return (__m64)((__vector long long)__bytes)[0];
}


/* Selects four specific SPFP values from A and B based on MASK.  Each
   2-bit field of the mask picks one word: fields 0 and 1 index into A,
   fields 2 and 3 index into B.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
  /* Byte indices of each of the four words, in vec_perm control form.  */
  static const unsigned int __word_bytes[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  const unsigned long __sel0 = __mask & 0x03;
  const unsigned long __sel1 = (__mask >> 2) & 0x03;
  const unsigned long __sel2 = (__mask >> 4) & 0x03;
  const unsigned long __sel3 = (__mask >> 6) & 0x03;
  __vector unsigned int __control;

  __control[0] = __word_bytes[__sel0];
  __control[1] = __word_bytes[__sel1];
  /* Adding 0x10 to every byte index redirects the selection to the second
     vec_perm operand (B).  */
  __control[2] = __word_bytes[__sel2] + 0x10101010;
  __control[3] = __word_bytes[__sel3] + 0x10101010;

  return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__control);
}


/* Interleaves the upper two SPFP values of A with those of B, using
   vec_vmrglw.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_ps(__m128 __A, __m128 __B) {
  const __v4sf __first = (__v4sf)__A;
  const __v4sf __second = (__v4sf)__B;

  return (__m128)vec_vmrglw(__first, __second);
}


/* Interleaves the lower two SPFP values of A with those of B, using
   vec_vmrghw.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_ps(__m128 __A, __m128 __B) {
  const __v4sf __first = (__v4sf)__A;
  const __v4sf __second = (__v4sf)__B;

  return (__m128)vec_vmrghw(__first, __second);
}


/* Returns A with doubleword 1 (the upper two SPFP values) replaced by the
   64 bits loaded from P; doubleword 0 is passed through from A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pi(__m128 __A, __m64 const *__P) {
  __vector unsigned long long __merged = (__vector unsigned long long)__A;
  const __vector unsigned long long __loaded = vec_splats(*__P);

  __merged[1] = __loaded[1];
  return (__m128)__merged;
}


/* Writes doubleword 1 of A (the upper two SPFP values) to *P.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pi(__m64 *__P, __m128 __A) {
  const __vector unsigned long long __halves =
      (__vector unsigned long long)__A;

  *__P = __halves[1];
}


/* Moves the upper two values of B into the lower two values of A; the
   upper two values of the result come from A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movehl_ps(__m128 __A, __m128 __B) {
  const __vector unsigned long long __from_b =
      (__vector unsigned long long)__B;
  const __vector unsigned long long __from_a =
      (__vector unsigned long long)__A;

  return (__m128)vec_mergel(__from_b, __from_a);
}


/* Moves the lower two values of B into the upper two values of A; the
   lower two values of the result come from A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movelh_ps(__m128 __A, __m128 __B) {
  const __vector unsigned long long __from_a =
      (__vector unsigned long long)__A;
  const __vector unsigned long long __from_b =
      (__vector unsigned long long)__B;

  return (__m128)vec_mergeh(__from_a, __from_b);
}


/* Returns A with doubleword 0 (the lower two SPFP values) replaced by the
   64 bits loaded from P; doubleword 1 is passed through from A.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pi(__m128 __A, __m64 const *__P) {
  __vector unsigned long long __merged = (__vector unsigned long long)__A;
  const __vector unsigned long long __loaded = vec_splats(*__P);

  __merged[0] = __loaded[0];
  return (__m128)__merged;
}


/* Writes doubleword 0 of A (the lower two SPFP values) to *P.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pi(__m64 *__P, __m128 __A) {
  const __vector unsigned long long __halves =
      (__vector unsigned long long)__A;

  *__P = __halves[0];
}


#ifdef _ARCH_PWR8

/* Intrinsic functions that require PowerISA 2.07 minimum.  */


/* Creates a 4-bit mask from the most significant bits of the SPFP values.

   __A is the source vector; the mask is returned as an int.  With
   _ARCH_PWR10 the result comes directly from vec_extractm; otherwise the
   bits are gathered with vec_vbpermq.  */

extern __inline int

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_movemask_ps(__m128 __A) {

#ifdef _ARCH_PWR10

  return vec_extractm((__vector unsigned int)__A);

#else

  __vector unsigned long long __result;

  /* Bit-permute control.  Bytes 0x00, 0x20, 0x40 and 0x60 name the bit
     positions to gather (one per 32-bit word); the 0x80 filler bytes are
     presumably out-of-range indices that yield zero bits -- confirm
     against the vbpermq definition in the Power ISA.  */

  static const __vector unsigned int __perm_mask = {

#ifdef __LITTLE_ENDIAN__

      0x00204060, 0x80808080, 0x80808080, 0x80808080

#else

      0x80808080, 0x80808080, 0x80808080, 0x00204060

#endif

  };


  __result = ((__vector unsigned long long)vec_vbpermq(

      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));


  /* The doubleword that holds the gathered bits differs by endianness.  */

#ifdef __LITTLE_ENDIAN__

  return __result[1];

#else

  return __result[0];

#endif

#endif /* !_ARCH_PWR10 */

}

#endif /* _ARCH_PWR8 */


/* Create a vector with all four elements equal to *P, by passing the
   loaded float to _mm_set1_ps.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_ps(float const *__P) {
  const float __value = *__P;

  return _mm_set1_ps(__value);
}


/* Alternate name for _mm_load1_ps; forwards P unchanged.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ps1(float const *__P) {
  const __m128 __splat = _mm_load1_ps(__P);

  return __splat;
}


/* Extracts one of the four words of A.  The selector N must be immediate;
   only its low two bits are used.  The selected 16 bits are returned
   zero-extended.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_pi16(__m64 const __A, int const __N) {
  /* On big-endian targets the word order within the 64-bit value is
     reversed, so the lane number is mirrored.  */
#ifdef __BIG_ENDIAN__
  const unsigned int __lane = 3 - (__N & 3);
#else
  const unsigned int __lane = __N & 3;
#endif

  return ((__A >> (__lane * 16)) & 0xffff);
}


/* Alternate name for _mm_extract_pi16; forwards A and N unchanged.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pextrw(__m64 const __A, int const __N) {
  const int __word = _mm_extract_pi16(__A, __N);

  return __word;
}


/* Inserts word D into one of four words of A.  The selector N must be
   immediate.

   Only the low 16 bits of D and the low two bits of N are used; the other
   three words of A are returned unchanged.  Lane numbering follows
   _mm_extract_pi16: on big-endian targets the word order within the 64-bit
   value is reversed, so the lane number is mirrored before shifting.  This
   keeps insert and extract addressing the same word for the same N.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
  unsigned int __lane = __N & 3;
#ifdef __BIG_ENDIAN__
  __lane = 3 - __lane;
#endif
  const unsigned int __shiftl = __lane * 16;
  const __m64 __mask = (__m64)0xffffUL << __shiftl;
  /* Masking after the shift discards the upper (sign-extended) bits of D.  */
  const __m64 __shiftD = ((__m64)__D << __shiftl) & __mask;

  return (__A & (~__mask)) | __shiftD;
}


/* Alternate name for _mm_insert_pi16; forwards A, D and N unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pinsrw(__m64 const __A, int const __D, int const __N) {
  const __m64 __updated = _mm_insert_pi16(__A, __D, __N);

  return __updated;
}


/* Compute the element-wise maximum of the four signed 16-bit values in A
   and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pi16(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  /* Vector path: compare, then select A where A > B and B elsewhere.  */
  const __vector signed short __lhs = (__vector signed short)vec_splats(__A);
  const __vector signed short __rhs = (__vector signed short)vec_splats(__B);
  const __vector __bool short __lhs_wins =
      (__vector __bool short)vec_cmpgt(__lhs, __rhs);
  const __vector signed short __picked = vec_sel(__rhs, __lhs, __lhs_wins);

  return (__m64)((__vector long long)__picked)[0];
#else
  /* Scalar path: walk the four halfwords through the union view.  */
  __m64_union __lhs, __rhs, __out;
  int __i;

  __lhs.as_m64 = __A;
  __rhs.as_m64 = __B;

  for (__i = 0; __i < 4; __i++) {
    if (__lhs.as_short[__i] > __rhs.as_short[__i])
      __out.as_short[__i] = __lhs.as_short[__i];
    else
      __out.as_short[__i] = __rhs.as_short[__i];
  }

  return (__m64)__out.as_m64;
#endif
}


/* Alternate name for _mm_max_pi16; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaxsw(__m64 __A, __m64 __B) {
  const __m64 __maxima = _mm_max_pi16(__A, __B);

  return __maxima;
}


/* Compute the element-wise maximum of the eight unsigned 8-bit values in A
   and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pu8(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  /* Vector path: compare, then select A where A > B and B elsewhere.  */
  const __vector unsigned char __lhs = (__vector unsigned char)vec_splats(__A);
  const __vector unsigned char __rhs = (__vector unsigned char)vec_splats(__B);
  const __vector __bool char __lhs_wins =
      (__vector __bool char)vec_cmpgt(__lhs, __rhs);
  const __vector unsigned char __picked = vec_sel(__rhs, __lhs, __lhs_wins);

  return (__m64)((__vector long long)__picked)[0];
#else
  /* Scalar path: compare each byte as unsigned char.  */
  __m64_union __lhs, __rhs, __out;
  long __i;

  __lhs.as_m64 = __A;
  __rhs.as_m64 = __B;

  for (__i = 0; __i < 8; __i++) {
    if ((unsigned char)__lhs.as_char[__i] > (unsigned char)__rhs.as_char[__i])
      __out.as_char[__i] = __lhs.as_char[__i];
    else
      __out.as_char[__i] = __rhs.as_char[__i];
  }

  return (__m64)__out.as_m64;
#endif
}


/* Alternate name for _mm_max_pu8; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaxub(__m64 __A, __m64 __B) {
  const __m64 __maxima = _mm_max_pu8(__A, __B);

  return __maxima;
}


/* Compute the element-wise minimum of the four signed 16-bit values in A
   and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pi16(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  /* Vector path: compare, then select A where A < B and B elsewhere.  */
  const __vector signed short __lhs = (__vector signed short)vec_splats(__A);
  const __vector signed short __rhs = (__vector signed short)vec_splats(__B);
  const __vector __bool short __lhs_wins =
      (__vector __bool short)vec_cmplt(__lhs, __rhs);
  const __vector signed short __picked = vec_sel(__rhs, __lhs, __lhs_wins);

  return (__m64)((__vector long long)__picked)[0];
#else
  /* Scalar path: walk the four halfwords through the union view.  */
  __m64_union __lhs, __rhs, __out;
  int __i;

  __lhs.as_m64 = __A;
  __rhs.as_m64 = __B;

  for (__i = 0; __i < 4; __i++) {
    if (__lhs.as_short[__i] < __rhs.as_short[__i])
      __out.as_short[__i] = __lhs.as_short[__i];
    else
      __out.as_short[__i] = __rhs.as_short[__i];
  }

  return (__m64)__out.as_m64;
#endif
}


/* Alternate name for _mm_min_pi16; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminsw(__m64 __A, __m64 __B) {
  const __m64 __minima = _mm_min_pi16(__A, __B);

  return __minima;
}


/* Compute the element-wise minimum of the eight unsigned 8-bit values in A
   and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pu8(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  /* Vector path: compare, then select A where A < B and B elsewhere.  */
  const __vector unsigned char __lhs = (__vector unsigned char)vec_splats(__A);
  const __vector unsigned char __rhs = (__vector unsigned char)vec_splats(__B);
  const __vector __bool char __lhs_wins =
      (__vector __bool char)vec_cmplt(__lhs, __rhs);
  const __vector unsigned char __picked = vec_sel(__rhs, __lhs, __lhs_wins);

  return (__m64)((__vector long long)__picked)[0];
#else
  /* Scalar path: compare each byte as unsigned char.  */
  __m64_union __lhs, __rhs, __out;
  long __i;

  __lhs.as_m64 = __A;
  __rhs.as_m64 = __B;

  for (__i = 0; __i < 8; __i++) {
    if ((unsigned char)__lhs.as_char[__i] < (unsigned char)__rhs.as_char[__i])
      __out.as_char[__i] = __lhs.as_char[__i];
    else
      __out.as_char[__i] = __rhs.as_char[__i];
  }

  return (__m64)__out.as_m64;
#endif
}


/* Alternate name for _mm_min_pu8; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminub(__m64 __A, __m64 __B) {
  const __m64 __minima = _mm_min_pu8(__A, __B);

  return __minima;
}


/* Create an 8-bit mask of the signs of 8-bit values.

   __A holds the eight bytes; the mask is returned as an int.  The bits are
   gathered with __builtin_bpermd, whose first argument is a list of bit
   indices (one per control byte).  The control constants step by 8, i.e.
   one index per byte of __A, and are listed in opposite order for the two
   endiannesses.  */

extern __inline int

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_movemask_pi8(__m64 __A) {

#ifdef __powerpc64__

  /* 64-bit path: one bpermd gathers all eight sign bits.  */

  unsigned long long __p =

#ifdef __LITTLE_ENDIAN__

      0x0008101820283038UL; // permute control for sign bits

#else

      0x3830282018100800UL; // permute control for sign bits

#endif

  return __builtin_bpermd(__p, __A);

#else

  /* Non-__powerpc64__ path: gather four bits from each 32-bit half and
     combine them, __r2 in the upper nibble and __r1 in the lower.
     NOTE(review): the enclosing header is guarded by
     defined(__powerpc64__), so this branch looks unreachable -- confirm.  */

#ifdef __LITTLE_ENDIAN__

  unsigned int __mask = 0x20283038UL;

  unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;

  unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;

#else

  unsigned int __mask = 0x38302820UL;

  unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;

  unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;

#endif

  return (__r2 << 4) | __r1;

#endif

}


/* Alternate name for _mm_movemask_pi8; forwards A unchanged.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmovmskb(__m64 __A) {
  const int __signs = _mm_movemask_pi8(__A);

  return __signs;
}


/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.

   Both inputs are splatted into full vectors, the even and odd halfword
   products are formed separately as 32-bit words, and a byte permute then
   picks two bytes from each product to build the result, which is taken
   from doubleword 0.  */

extern __inline __m64

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_mulhi_pu16(__m64 __A, __m64 __B) {

  __vector unsigned short __a, __b;

  __vector unsigned short __c;

  __vector unsigned int __w0, __w1;

  /* vec_perm control: indices 0x00-0x0F address bytes of __w0 and
     0x10-0x1F address bytes of __w1.  Each pair of indices takes two
     adjacent bytes of one product, alternating between __w0 and __w1.
     The byte offsets within a word differ by endianness (2,3 on
     little-endian, 0,1 on big-endian).  The big-endian table repeats its
     first eight entries; only doubleword 0 of the result is returned.  */

  __vector unsigned char __xform1 = {

#ifdef __LITTLE_ENDIAN__

      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,

      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F

#else

      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,

      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15

#endif

  };


  __a = (__vector unsigned short)vec_splats(__A);

  __b = (__vector unsigned short)vec_splats(__B);


  /* Even-numbered and odd-numbered halfword products, as 32-bit words.  */

  __w0 = vec_vmuleuh(__a, __b);

  __w1 = vec_vmulouh(__a, __b);

  __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);


  return (__m64)((__vector long long)__c)[0];

}


/* Alternate name for _mm_mulhi_pu16; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhuw(__m64 __A, __m64 __B) {
  const __m64 __high_words = _mm_mulhi_pu16(__A, __B);

  return __high_words;
}


/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.

   Each 2-bit field of N (bits 1:0, 3:2, 5:4, 7:6) chooses which word of A
   goes to the corresponding result word.  The fields are translated into a
   byte-permute control, A is splatted into a full vector, and vec_perm
   performs the selection; doubleword 0 of the permuted vector is
   returned.  */

extern __inline __m64

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_shuffle_pi16(__m64 __A, int const __N) {

  unsigned long __element_selector_10 = __N & 0x03;

  unsigned long __element_selector_32 = (__N >> 2) & 0x03;

  unsigned long __element_selector_54 = (__N >> 4) & 0x03;

  unsigned long __element_selector_76 = (__N >> 6) & 0x03;

  /* Byte-index pairs, one entry per selectable word, with
     endian-specific values.  */

  static const unsigned short __permute_selectors[4] = {

#ifdef __LITTLE_ENDIAN__

      0x0908, 0x0B0A, 0x0D0C, 0x0F0E

#else

      0x0607, 0x0405, 0x0203, 0x0001

#endif

  };

  __m64_union __t;

  __vector unsigned long long __a, __p, __r;


  /* Fill the four control halfwords; the slot order is reversed on
     big-endian.  */

#ifdef __LITTLE_ENDIAN__

  __t.as_short[0] = __permute_selectors[__element_selector_10];

  __t.as_short[1] = __permute_selectors[__element_selector_32];

  __t.as_short[2] = __permute_selectors[__element_selector_54];

  __t.as_short[3] = __permute_selectors[__element_selector_76];

#else

  __t.as_short[3] = __permute_selectors[__element_selector_10];

  __t.as_short[2] = __permute_selectors[__element_selector_32];

  __t.as_short[1] = __permute_selectors[__element_selector_54];

  __t.as_short[0] = __permute_selectors[__element_selector_76];

#endif

  __p = vec_splats(__t.as_m64);

  __a = vec_splats(__A);

  /* Both vec_perm data operands are the same splatted copy of A.  */

  __r = vec_perm(__a, __a, (__vector unsigned char)__p);

  return (__m64)((__vector long long)__r)[0];

}


/* Alternate name for _mm_shuffle_pi16; forwards A and N unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pshufw(__m64 __A, int const __N) {
  const __m64 __shuffled = _mm_shuffle_pi16(__A, __N);

  return __shuffled;
}


/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  The implementation loads all eight bytes at P, merges in
   the selected bytes of A, and stores all eight bytes back.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
  const __m64 __hibit = 0x8080808080808080UL;
  __m64 *const __dest = (__m64 *)__P;
  const __m64 __old = *__dest;
  /* All-ones in every byte whose selector high bit is set.  */
  const __m64 __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);

  *__dest = (__old & (~__mask)) | (__A & __mask);
}


/* Alternate name for _mm_maskmove_si64; forwards A, N and P unchanged.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
  /* Pure forwarder: no additional work is done here.  */
  _mm_maskmove_si64(__A, __N, __P);
}


/* Compute the rounded averages of the unsigned 8-bit values in A and B,
   using vec_avg on splatted copies of the inputs.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu8(__m64 __A, __m64 __B) {
  const __vector unsigned char __lhs = (__vector unsigned char)vec_splats(__A);
  const __vector unsigned char __rhs = (__vector unsigned char)vec_splats(__B);
  const __vector unsigned char __mean = vec_avg(__lhs, __rhs);

  return (__m64)((__vector long long)__mean)[0];
}


/* Alternate name for _mm_avg_pu8; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgb(__m64 __A, __m64 __B) {
  const __m64 __mean = _mm_avg_pu8(__A, __B);

  return __mean;
}


/* Compute the rounded averages of the unsigned 16-bit values in A and B,
   using vec_avg on splatted copies of the inputs.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu16(__m64 __A, __m64 __B) {
  const __vector unsigned short __lhs =
      (__vector unsigned short)vec_splats(__A);
  const __vector unsigned short __rhs =
      (__vector unsigned short)vec_splats(__B);
  const __vector unsigned short __mean = vec_avg(__lhs, __rhs);

  return (__m64)((__vector long long)__mean)[0];
}


/* Alternate name for _mm_avg_pu16; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgw(__m64 __A, __m64 __B) {
  const __m64 __mean = _mm_avg_pu16(__A, __B);

  return __mean;
}


/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  The sum is stored in 16-bit word 0 of the result;
   the remaining words are zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_pu8(__m64 __A, __m64 __B) {
  const __vector unsigned int __zero = {0, 0, 0, 0};
  /* Each input occupies doubleword 1; doubleword 0 is zero in both, so it
     contributes nothing to the sum.  */
  const __vector unsigned char __lhs =
      (__vector unsigned char)(__vector unsigned long long){0UL, __A};
  const __vector unsigned char __rhs =
      (__vector unsigned char)(__vector unsigned long long){0UL, __B};
  /* |a - b| for unsigned bytes, computed as max - min.  */
  const __vector unsigned char __smaller = vec_min(__lhs, __rhs);
  const __vector unsigned char __larger = vec_max(__lhs, __rhs);
  const __vector unsigned char __absdiff = vec_sub(__larger, __smaller);
  __vector signed int __total;
  __m64_union __out = {0};

  /* Sum four groups of bytes into integers.  */
  __total = (__vector signed int)vec_sum4s(__absdiff, __zero);
  /* Sum across four integers with integer result.  */
  __total = vec_sums(__total, (__vector signed int)__zero);
  /* The sum is read from element 3 and truncated to 16 bits.  */
  __out.as_short[0] = __total[3];

  return __out.as_m64;
}


/* Alternate name for _mm_sad_pu8; forwards A and B unchanged.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psadbw(__m64 __A, __m64 __B) {
  const __m64 __sum = _mm_sad_pu8(__A, __B);

  return __sum;
}


/* Stores the data in A to the address P without polluting the caches.

   __P is the destination and __A the 64-bit value to store.  The store
   itself is an ordinary assignment; it is preceded by a dcbtstt cache
   hint on the destination address.  */

extern __inline void

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_stream_pi(__m64 *__P, __m64 __A) {

  /* Use the data cache block touch for store transient.  The "memory"
     clobber stops the compiler from moving memory accesses across the
     hint.  */

  __asm__("  dcbtstt 0,%0" : : "b"(__P) : "memory");

  *__P = __A;

}


/* Stores the four SPFP values in A to the address P, preceded by the same
   dcbtstt cache hint used by _mm_stream_pi.  The address must be 16-byte
   aligned.  The store itself is delegated to _mm_store_ps.  */

extern __inline void

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_stream_ps(float *__P, __m128 __A) {

  /* Use the data cache block touch for store transient.  The "memory"
     clobber stops the compiler from moving memory accesses across the
     hint.  */

  __asm__("  dcbtstt 0,%0" : : "b"(__P) : "memory");

  _mm_store_ps(__P, __A);

}


/* Store fence: guarantees that every preceding store is globally visible
   before any subsequent store.  Implemented as a release-ordered thread
   fence.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sfence(void) {
  /* A release fence is the light-weight sync referred to above.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}


/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */

extern __inline void

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _mm_pause(void) {

  /* There is no exact match with this construct, but the following is

     close to the desired effect.  */

#if _ARCH_PWR8

  /* On power8 and later processors we can depend on Program Priority
     (PRI) and associated "very low" PPI setting.  Since we don't know
     what PPI this thread is running at we: 1) save the current PRI
     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding. 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */

  /* Holds the saved PPR value between the mfppr and mtppr below.  */

  unsigned long __PPR;


  __asm__ volatile(" mfppr %0;"

                   "   or 31,31,31;"

                   "   isync;"

                   "   lwsync;"

                   "   isync;"

                   "   mtppr  %0;"

                   : "=r"(__PPR)

                   :

                   : "memory");

#else

  /* For older processor where we may not even have Program Priority

     controls we can only depend on Heavy Weight Sync.  */

  __atomic_thread_fence(__ATOMIC_SEQ_CST);

#endif

}


/* Transpose the 4x4 matrix composed of row[0-3].

   All four arguments are read once into temporaries and then assigned to,
   so each must be a modifiable lvalue of vector-float type.  The first
   stage interleaves words of row pairs (vec_vmrghw / vec_vmrglw); the
   second stage merges doublewords of those intermediates to form the
   transposed rows.  Every continuation line must end in a backslash with
   no blank line after it, otherwise the definition is cut short.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
  do {                                                                         \
    __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);         \
    __v4sf __t0 = vec_vmrghw(__r0, __r1);                                      \
    __v4sf __t1 = vec_vmrghw(__r2, __r3);                                      \
    __v4sf __t2 = vec_vmrglw(__r0, __r1);                                      \
    __v4sf __t3 = vec_vmrglw(__r2, __r3);                                      \
    (row0) = (__v4sf)vec_mergeh((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row1) = (__v4sf)vec_mergel((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row2) = (__v4sf)vec_mergeh((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
    (row3) = (__v4sf)vec_mergel((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
  } while (0)


/* For backward source compatibility.  */

//# include <emmintrin.h>


#else

#include_next <xmmintrin.h>

#endif /* defined(__powerpc64__) &&                                            \

        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */


#endif /* XMMINTRIN_H_ */

__attribute__
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
Definition __clang_hip_libdevice_declares.h:285

int
__device__ int
Definition __clang_hip_libdevice_declares.h:68

float
__device__ float
Definition __clang_hip_libdevice_declares.h:28

altivec.h

vec_cmpeq
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition altivec.h:1708

vec_vmrghw
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition altivec.h:5326

vec_ctf
#define vec_ctf(__a, __b)
Definition altivec.h:3244

vec_vupkhsh
static __inline__ vector int __ATTRS_o_ai vec_vupkhsh(vector short __a)
Definition altivec.h:12731

__c
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800

__b
static __inline__ vector float vector float __b
Definition altivec.h:578

vec_ld
static __inline__ vector signed char __ATTRS_o_ai vec_ld(long __a, const vector signed char *__b)
Definition altivec.h:4061

vec_splats
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition altivec.h:14737

vec_vupkhsb
static __inline__ vector short __ATTRS_o_ai vec_vupkhsb(vector signed char __a)
Definition altivec.h:12712

vec_andc
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition altivec.h:1235

vec_st
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, long __b, vector signed char *__c)
Definition altivec.h:11184

vec_sum4s
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition altivec.h:12487

vec_and
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition altivec.h:882

vec_avg
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition altivec.h:1586

vec_mergel
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition altivec.h:5361

vec_vmrglw
static __inline__ vector int __ATTRS_o_ai vec_vmrglw(vector int __a, vector int __b)
Definition altivec.h:5589

vec_perm
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition altivec.h:7962

vec_sel
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition altivec.h:8588

vec_mergeh
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition altivec.h:5091

vec_vupklsh
static __inline__ vector int __ATTRS_o_ai vec_vupklsh(vector short __a)
Definition altivec.h:12870

vec_cmplt
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition altivec.h:2435

vec_max
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition altivec.h:4838

vec_nor
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition altivec.h:6729

vec_cmpge
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition altivec.h:2243

vec_pack
static __inline__ vector signed char __ATTRS_o_ai vec_pack(vector signed short __a, vector signed short __b)
Definition altivec.h:7389

vec_re
static __inline__ vector float __ATTRS_o_ai vec_re(vector float __a)
Definition altivec.h:8263

vec_min
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition altivec.h:5742

vec_cts
#define vec_cts
Definition altivec.h:3319

vec_splat
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition altivec.h:10090

vec_or
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition altivec.h:6865

vec_abs
static __inline__ vector signed char __ATTRS_o_ai vec_abs(vector signed char __a)
Definition altivec.h:117

vec_xor
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition altivec.h:13207

vec_rsqrte
static __inline__ vector float __ATTRS_o_ai vec_rsqrte(vector float __a)
Definition altivec.h:8541

vec_cmpgt
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition altivec.h:2131

vec_cmple
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition altivec.h:2369

vec_sub
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition altivec.h:11869

__p
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57

__asm__
__asm__("swp %0, %1, [%2]" :"=r"(__v) :"r"(__x), "r"(__p) :"memory")

__a
static __inline__ void int __a
Definition emmintrin.h:4077

_mm_pause
void _mm_pause(void)
Indicates that a spin loop is being executed for the purposes of optimizing power consumption during ...

__D
static __inline__ void short __D
Definition immintrin.h:342

_mm_cmpeq_pi8
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition mmintrin.h:1178

as_short
#define as_short(x)
Definition opencl-c-base.h:599

__P
__inline unsigned int unsigned int unsigned int * __P
Definition bmi2intrin.h:25

__Y
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19

mm_malloc.h

mmintrin.h

_mm_comigt_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1181

_mm_cvttss_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition xmmintrin.h:1503

_m_pinsrw
#define _m_pinsrw
Definition xmmintrin.h:3156

_mm_avg_pu16
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu16(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 16-bit integer values and writes the averages to...
Definition xmmintrin.h:2560

_mm_movemask_pi8
static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_movemask_pi8(__m64 __a)
Takes the most significant bit from each 8-bit element in a 64-bit integer vector to create an 8-bit ...
Definition xmmintrin.h:2420

_mm_rcp_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a)
Calculates the approximate reciprocal of the value stored in the low-order bits of a 128-bit vector o...
Definition xmmintrin.h:270

_mm_move_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_ss(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2804

_mm_cmplt_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:582

_mm_cvt_pi2ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvt_pi2ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1710

_mm_sqrt_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a)
Calculates the square root of the value stored in the low-order bits of a 128-bit vector of [4 x floa...
Definition xmmintrin.h:235

_MM_HINT_ET0
#define _MM_HINT_ET0
Definition xmmintrin.h:2185

_mm_cmpnge_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:957

_mm_cmpeq_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for equa...
Definition xmmintrin.h:534

_mm_cvtpi16_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi16_ps(__m64 __a)
Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2862

_mm_cmplt_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:560

_mm_avg_pu8
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_avg_pu8(__m64 __a, __m64 __b)
Computes the rounded averages of the packed unsigned 8-bit integer values and writes the averages to ...
Definition xmmintrin.h:2541

_mm_cvt_ps2pi
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition xmmintrin.h:1481

_mm_cvt_ss2si
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition xmmintrin.h:1415

_mm_cvtpi32_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32_ps(__m128 __a, __m64 __b)
Converts two elements of a 64-bit vector of [2 x i32] into two floating point values and writes them ...
Definition xmmintrin.h:1684

_mm_cmpeq_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:513

_mm_mulhi_pu16
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mulhi_pu16(__m64 __a, __m64 __b)
Multiplies packed 16-bit unsigned integer values and writes the high-order 16 bits of each 32-bit pro...
Definition xmmintrin.h:2438

_mm_mul_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ss(__m128 __a, __m128 __b)
Multiplies two 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:160

_mm_load_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an aligned memory location.
Definition xmmintrin.h:1846

_mm_cmpneq_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] for ineq...
Definition xmmintrin.h:778

_mm_comile_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1157

_mm_cvtpi32x2_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
Converts the two 32-bit signed integer values from each 64-bit vector operand of [2 x i32] into a 128...
Definition xmmintrin.h:2943

_mm_storer_ps
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a)
Stores float values from a 128-bit vector of [4 x float] to an aligned memory location in reverse ord...
Definition xmmintrin.h:2179

_mm_undefined_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
Definition xmmintrin.h:1899

_mm_cmpnle_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:855

_mm_set_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float] initialized with the specified single-preci...
Definition xmmintrin.h:1980

_mm_ucomilt_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1276

_m_pmulhuw
#define _m_pmulhuw
Definition xmmintrin.h:3162

_mm_sub_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ss(__m128 __a, __m128 __b)
Subtracts the 32-bit float value in the low-order bits of the second operand from the corresponding v...
Definition xmmintrin.h:119

_mm_cvtsi32_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1613

_m_pmaxub
#define _m_pmaxub
Definition xmmintrin.h:3158

_mm_unpacklo_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpacklo_ps(__m128 __a, __m128 __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x float] and interleaves them...
Definition xmmintrin.h:2783

_mm_add_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ss(__m128 __a, __m128 __b)
Adds the 32-bit float values in the low-order bits of the operands.
Definition xmmintrin.h:79

_mm_cvtss_f32
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_f32(__m128 __a)
Extracts a float value contained in the lower 32 bits of a vector of [4 x float].
Definition xmmintrin.h:1727

_mm_cmple_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:608

_mm_sad_pu8
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b)
Subtracts the corresponding 8-bit unsigned integer values of the two 64-bit vector operands and compu...
Definition xmmintrin.h:2582

_mm_add_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ps(__m128 __a, __m128 __b)
Adds two 128-bit vectors of [4 x float], and returns the results of the addition.
Definition xmmintrin.h:98

_mm_rsqrt_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a)
Calculates the approximate reciprocals of the square roots of the values stored in a 128-bit vector o...
Definition xmmintrin.h:323

_mm_xor_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:489

_mm_storel_pi
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a)
Stores the lower 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2058

_mm_set_ps1
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps1(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1954

_m_pmaxsw
#define _m_pmaxsw
Definition xmmintrin.h:3157

_mm_ucomile_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1300

_mm_cmpge_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:730

_mm_shuffle_ps
#define _mm_shuffle_ps(a, b, mask)
Selects 4 float values from the 128-bit operands of [4 x float], as specified by the immediate value ...
Definition xmmintrin.h:2741

_m_pavgw
#define _m_pavgw
Definition xmmintrin.h:3166

_mm_comieq_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for equality.
Definition xmmintrin.h:1108

_mm_store1_ps
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition xmmintrin.h:2140

_mm_sfence
void _mm_sfence(void)
Forces strong memory ordering (serialization) between store instructions preceding this instruction a...

_mm_load_ps1
#define _mm_load_ps1(p)
Definition xmmintrin.h:1832

_mm_cvt_si2ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a, int __b)
Converts a 32-bit signed integer value into a floating point value and writes it to the lower 32 bits...
Definition xmmintrin.h:1635

_mm_set1_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_ps(float __w)
Constructs a 128-bit floating-point vector of [4 x float], with each of the four single-precision flo...
Definition xmmintrin.h:1936

_mm_unpackhi_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_unpackhi_ps(__m128 __a, __m128 __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x float] and interleaves the...
Definition xmmintrin.h:2762

_MM_HINT_ET1
#define _MM_HINT_ET1
Definition xmmintrin.h:2186

_mm_div_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ps(__m128 __a, __m128 __b)
Divides two 128-bit vectors of [4 x float].
Definition xmmintrin.h:218

_mm_max_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the greater of each pair of values.
Definition xmmintrin.h:415

_m_pextrw
#define _m_pextrw
Definition xmmintrin.h:3155

_mm_min_pi16
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_min_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2385

_mm_rsqrt_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a)
Calculates the approximate reciprocal of the square root of the value stored in the low-order bits of...
Definition xmmintrin.h:306

_mm_ucomige_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1348

_mm_andnot_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the value...
Definition xmmintrin.h:454

_mm_comilt_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1133

_mm_loadl_pi
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the low-order bits of a 128-bit vector of [4 ...
Definition xmmintrin.h:1774

_mm_storeu_ps
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] to an unaligned memory location.
Definition xmmintrin.h:2100

_mm_movehl_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movehl_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2825

_mm_load1_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p)
Loads a 32-bit float value and duplicates it to all four vector elements of a 128-bit vector of [4 x ...
Definition xmmintrin.h:1823

_mm_min_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b)
Compares two 128-bit vectors of [4 x float] and returns the lesser of each pair of values.
Definition xmmintrin.h:369

_mm_stream_ps
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(void *__p, __m128 __a)
Moves packed float values from a 128-bit vector of [4 x float] to a 128-bit aligned memory location.
Definition xmmintrin.h:2257

_mm_stream_pi
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a)
Stores a 64-bit integer in the specified aligned memory location.
Definition xmmintrin.h:2238

_m_pavgb
#define _m_pavgb
Definition xmmintrin.h:3165

_m_pmovmskb
#define _m_pmovmskb
Definition xmmintrin.h:3161

_mm_comige_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first ope...
Definition xmmintrin.h:1205

_mm_cvtss_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a)
Converts a float value contained in the lower 32 bits of a vector of [4 x float] into a 32-bit intege...
Definition xmmintrin.h:1393

_mm_cmpgt_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:680

_mm_max_pi16
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_max_pi16(__m64 __a, __m64 __b)
Compares each of the corresponding packed 16-bit integer values of the 64-bit integer vectors,...
Definition xmmintrin.h:2349

_m_psadbw
#define _m_psadbw
Definition xmmintrin.h:3167

_mm_ucomigt_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1324

_mm_cvtps_pi16
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2968

_mm_movelh_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movelh_ps(__m128 __a, __m128 __b)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:2845

_mm_store_ss
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2079

_mm_cmpngt_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:905

_mm_loadh_pi
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p)
Loads two packed float values from the address __p into the high-order bits of a 128-bit vector of [4...
Definition xmmintrin.h:1747

_mm_cvtpi8_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpi8_ps(__m64 __a)
Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] into a 128-bit vector of [4 x f...
Definition xmmintrin.h:2898

_mm_rcp_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a)
Calculates the approximate reciprocals of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:287

_mm_store_ps
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a)
Stores a 128-bit vector of [4 x float] into an aligned memory location.
Definition xmmintrin.h:2121

_MM_HINT_T0
#define _MM_HINT_T0
Definition xmmintrin.h:2187

_mm_extract_pi16
#define _mm_extract_pi16(a, n)
Extracts 16-bit element from a 64-bit vector of [4 x i16] and returns it, as specified by the immedia...
Definition xmmintrin.h:2301

_mm_or_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:471

_mm_sqrt_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a)
Calculates the square roots of the values stored in a 128-bit vector of [4 x float].
Definition xmmintrin.h:252

_mm_cmpneq_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands for inequality.
Definition xmmintrin.h:756

_mm_prefetch
#define _mm_prefetch(a, sel)
Loads one cache line of data from the specified address to a location closer to the processor.
Definition xmmintrin.h:2221

_mm_movemask_ps
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_ps(__m128 __a)
Extracts the sign bits from each single-precision floating-point element of a 128-bit floating-point ...
Definition xmmintrin.h:3017

_m_pminsw
#define _m_pminsw
Definition xmmintrin.h:3159

_mm_cvtpu8_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu8_ps(__m64 __a)
Converts the lower four unsigned 8-bit integer values from a 64-bit vector of [8 x u8] into a 128-bit...
Definition xmmintrin.h:2919

_mm_cvtpu16_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtpu16_ps(__m64 __a)
Converts a 64-bit vector of 16-bit unsigned integer values into a 128-bit vector of [4 x float].
Definition xmmintrin.h:2880

_mm_cvttps_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated ...
Definition xmmintrin.h:1571

_mm_cvtps_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a)
Converts two low-order float values in a 128-bit vector of [4 x float] into a 64-bit vector of [2 x i...
Definition xmmintrin.h:1461

_mm_cvtt_ss2si
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a)
Converts the lower (first) element of a vector of [4 x float] into a signed truncated (rounded toward...
Definition xmmintrin.h:1525

_mm_cvtps_pi8
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a)
Converts each single-precision floating-point element of a 128-bit floating-point vector of [4 x floa...
Definition xmmintrin.h:2993

_mm_and_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
Definition xmmintrin.h:433

_mm_max_pu8
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_max_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors...
Definition xmmintrin.h:2367

_mm_loadr_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p)
Loads four packed float values, in reverse order, from an aligned memory location to 32-bit elements ...
Definition xmmintrin.h:1885

_mm_cmpord_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:1009

_mm_cmpnlt_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:828

_mm_storeh_pi
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a)
Stores the upper 64 bits of a 128-bit vector of [4 x float] to a memory location.
Definition xmmintrin.h:2037

_MM_HINT_T1
#define _MM_HINT_T1
Definition xmmintrin.h:2188

_mm_cmpngt_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:930

_m_pshufw
#define _m_pshufw
Definition xmmintrin.h:3163

_mm_cmpnge_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:982

_m_maskmovq
#define _m_maskmovq
Definition xmmintrin.h:3164

_mm_insert_pi16
#define _mm_insert_pi16(a, d, n)
Copies data from the 64-bit vector of [4 x i16] to the destination, and inserts the lower 16-bits of ...
Definition xmmintrin.h:2332

_mm_shuffle_pi16
#define _mm_shuffle_pi16(a, n)
Shuffles the 4 16-bit integers from a 64-bit integer vector to the destination, as specified by the i...
Definition xmmintrin.h:2478

_mm_cmpord_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to deter...
Definition xmmintrin.h:1033

_mm_setzero_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
Definition xmmintrin.h:2021

_mm_cmpgt_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:656

_mm_ucomieq_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands...
Definition xmmintrin.h:1252

_mm_maskmove_si64
static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
Conditionally copies the values from each 8-bit element in the first 64-bit integer vector operand to...
Definition xmmintrin.h:2507

_mm_cmpnlt_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in ...
Definition xmmintrin.h:805

_mm_store_ps1
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a)
Stores the lower 32 bits of a 128-bit vector of [4 x float] into four contiguous elements in an align...
Definition xmmintrin.h:2160

_mm_setr_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_ps(float __z, float __y, float __x, float __w)
Constructs a 128-bit floating-point vector of [4 x float], initialized in reverse order with the spec...
Definition xmmintrin.h:2007

_mm_min_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the lesser value ...
Definition xmmintrin.h:348

_mm_cmple_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are less than or equal to those in the second operand.
Definition xmmintrin.h:630

_mm_sub_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ps(__m128 __a, __m128 __b)
Subtracts each of the values of the second operand from the first operand, both of which are 128-bit vectors of [4 x float], and returns the results of the subtraction.
Definition xmmintrin.h:139

_mm_max_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands and returns the greater value in the low-order bits of a 128-bit vector of [4 x float].
Definition xmmintrin.h:394

_m_pminub
#define _m_pminub
Definition xmmintrin.h:3160

_mm_div_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ss(__m128 __a, __m128 __b)
Divides the value in the low-order 32 bits of the first operand by the corresponding value in the second operand.
Definition xmmintrin.h:200

_mm_ucomineq_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b)
Performs an unordered comparison of two 32-bit float values using the low-order bits of both operands to determine inequality.
Definition xmmintrin.h:1371

_MM_HINT_NTA
#define _MM_HINT_NTA
Definition xmmintrin.h:2190

_mm_comineq_ss
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the first operand is not equal to the second operand.
Definition xmmintrin.h:1229

_mm_cvtt_ps2pi
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a)
Converts the lower (first) two elements of a 128-bit vector of [4 x float] into two signed truncated (rounded toward zero) 32-bit integers, returned in a 64-bit vector of [2 x i32].
Definition xmmintrin.h:1592

_mm_set_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ss(float __w)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1919

_mm_min_pu8
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_min_pu8(__m64 __a, __m64 __b)
Compares each of the corresponding packed 8-bit unsigned integer values of the 64-bit integer vectors, and writes the lesser value to the corresponding bits in the destination.
Definition xmmintrin.h:2403

_mm_cmpnle_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are not less than or equal to those in the second operand.
Definition xmmintrin.h:878

_MM_HINT_T2
#define _MM_HINT_T2
Definition xmmintrin.h:2189

_mm_cmpunord_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is unordered with respect to the corresponding value in the second operand.
Definition xmmintrin.h:1060

_mm_cmpge_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b)
Compares two 32-bit float values in the low-order bits of both operands to determine if the value in the first operand is greater than or equal to the corresponding value in the second operand.
Definition xmmintrin.h:706

_mm_cmpunord_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b)
Compares each of the corresponding 32-bit float values of the 128-bit vectors of [4 x float] to determine if the values in the first operand are unordered with respect to those in the second operand.
Definition xmmintrin.h:1084

_mm_loadu_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p)
Loads a 128-bit floating-point vector of [4 x float] from an unaligned memory location.
Definition xmmintrin.h:1863

_mm_load_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p)
Constructs a 128-bit floating-point vector of [4 x float].
Definition xmmintrin.h:1801

_mm_mul_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ps(__m128 __a, __m128 __b)
Multiplies two 128-bit vectors of [4 x float] and returns the results of the multiplication.
Definition xmmintrin.h:179