doxygen/ppc__wrappers_2bmi2intrin_8h_source.html

/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===

 *

 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 * See https://llvm.org/LICENSE.txt for license information.

 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 *

 *===-----------------------------------------------------------------------===

 */


/* This wrapper is not meant to be included on its own: refuse to compile
   unless X86GPRINTRIN_H_ has already been defined by the including header.  */
#if !defined X86GPRINTRIN_H_

#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."

#endif


#ifndef BMI2INTRIN_H_

#define BMI2INTRIN_H_


/* Copy __X with every bit from bit index __Y upward cleared (BMI2 BZHI).

   Only the low 8 bits of __Y form the index, matching the x86 instruction.
   An index of 0 yields 0, and an index of 32 or more returns __X unchanged.

   A mask is used instead of the shift-left/shift-right idiom because
   shifting a 32-bit value by 32 or more bits is undefined behaviour in C,
   which the old form hit for __Y == 0 and for __Y > 32.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  const unsigned int __n = __Y & 0xFFU; /* index is taken from bits 7:0 */

  if (__n >= 32)
    return __X; /* nothing to clear */
  /* __n is in 0..31 here, so the shift below is always well defined.  */
  return __X & ((1U << __n) - 1U);
}


/* Unsigned 32 x 32 -> 64-bit multiply (BMI2 MULX).
   The product of __X and __Y is formed in an unsigned long long; the part
   above bit 31 is stored through __P and the low part is returned.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  const unsigned long long __wide = (unsigned long long)__X * __Y;
  const unsigned int __lo = (unsigned int)__wide;
  const unsigned int __hi = (unsigned int)(__wide >> 32);

  *__P = __hi;
  return __lo;
}


#ifdef __PPC64__

/* Copy __X with every bit from bit index __Y upward cleared (BMI2 BZHI,
   64-bit form).

   Only the low 8 bits of __Y form the index, matching the x86 instruction.
   An index of 0 yields 0, and an index of 64 or more returns __X unchanged.

   A mask is used instead of the shift-left/shift-right idiom because
   shifting a 64-bit value by 64 or more bits is undefined behaviour in C,
   which the old form hit for __Y == 0 and for __Y > 64.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  const unsigned long long __n = __Y & 0xFFULL; /* index is bits 7:0 */

  if (__n >= 64)
    return __X; /* nothing to clear */
  /* __n is in 0..63 here, so the shift below is always well defined.  */
  return __X & ((1ULL << __n) - 1ULL);
}


/* __int128 requires base 64-bit.  */

/* Unsigned 64 x 64 -> 128-bit multiply (BMI2 MULX, 64-bit form).
   The product of __X and __Y is formed in an unsigned __int128; the part
   above bit 63 is stored through __P and the low part is returned.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  const unsigned __int128 __wide = (unsigned __int128)__X * __Y;
  const unsigned long long __lo = (unsigned long long)__wide;
  const unsigned long long __hi = (unsigned long long)(__wide >> 64);

  *__P = __hi;
  return __lo;
}


#ifdef _ARCH_PWR7

/* popcount and bpermd require power7 minimum.  */

/* Parallel bit deposit: scatter the low-order bits of __X into the bit
   positions selected by the set bits of __M.  Result bits whose mask bit
   is clear are zero.

   The mask is consumed from its most-significant set bit downward.  With
   k mask bits still pending, the next one receives source bit k - 1, so
   the lowest mask bit ends up receiving bit 0 of __X.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __msb = 0x8000000000000000UL;
  unsigned long __pending = __M; /* mask bits not yet deposited */
  unsigned long __acc = 0x0UL;

  /* 64 minus the number of pending mask bits; subtracting the
     leading-zero count of the pending mask gives the left shift that
     lines the next source bit up with its destination.  */
  unsigned long __shift = 64 - __builtin_popcountl(__M);

  /* One iteration per '1' bit in the mask; each pass clears the bit it
     has just handled.  */
  while (__pending != 0) {
    const unsigned long __lz = __builtin_clzl(__pending);
    const unsigned long __bit = __msb >> __lz; /* current mask bit */

    __acc |= (__X << (__shift - __lz)) & __bit;
    __pending ^= __bit;
    ++__shift;
  }
  return __acc;
}


/* Parallel bit extract: gather the bits of __X that sit at the positions
   of the set bits of __M and pack them contiguously into the low-order
   end of the result.

   The mask is walked from its most-significant set bit downward.  In the
   general (loop) path the lowest selected bit lands in result bit 0 and
   all unused high-order result bits are zero.  */
extern __inline unsigned long long

    __attribute__((__gnu_inline__, __always_inline__, __artificial__))

    _pext_u64(unsigned long long __X, unsigned long long __M) {

  unsigned long __p = 0x4040404040404040UL; // initial bit permute control

  /* Single bit at the most-significant position; shifted right by a
     leading-zero count it addresses exactly one mask bit.  */
  const unsigned long __mask = 0x8000000000000000UL;

  /* Working copy of the mask; bits are cleared as they are processed.  */
  unsigned long __m = __M;

  unsigned long __c;

  unsigned long __result;


  /* If the mask is a compile-time constant that selects 8 bits or fewer,
     use the bit permute builtin (__builtin_bpermd).  */

  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {

    /* Also, if the pext mask is constant then the popcount is constant,
       so the following loop can be evaluated at compile time and a
       constant bit permute vector used.  */

    long __i;

    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {

      /* Append the leading-zero index of the highest remaining mask bit
         as a new low-order control byte, then clear that mask bit.  */
      __c = __builtin_clzl(__m);

      __p = (__p << 8) | __c;

      __m ^= (__mask >> __c);

    }

    /* NOTE(review): presumably each control byte of __p selects one bit
       of __X by leading-zero index and the 0x40 filler bytes contribute
       zero bits -- confirm against the bpermd definition.  */
    __result = __builtin_bpermd(__p, __X);

  } else {

    /* With k mask bits still pending __p equals 64 - k, so shifting right
       by (__p - __c) moves the bit at leading-zero index __c down to
       result bit k - 1.  */
    __p = 64 - __builtin_popcountl(__M);

    __result = 0;

    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while loop avoids
       unrolling and the compiler commons the xor from clearing the mask
       bit with the (m != 0) test.  The result is a more compact loop
       setup and body.  */

    while (__m != 0) {

      unsigned long __t;

      __c = __builtin_clzl(__m);

      __t = (__X & (__mask >> __c)) >> (__p - __c);

      __m ^= (__mask >> __c);

      __result |= (__t);

      __p++;

    }

  }

  return __result;

}


/* these 32-bit implementations depend on 64-bit pdep/pext

   which depend on _ARCH_PWR7.  */

/* 32-bit parallel bit deposit, implemented on top of _pdep_u64.
   Both operands are widened to 64 bits (zero-extended, as they are
   unsigned) and the 64-bit result is narrowed back to unsigned int.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide =
      _pdep_u64((unsigned long long)__X, (unsigned long long)__Y);

  return (unsigned int)__wide;
}


/* 32-bit parallel bit extract, implemented on top of _pext_u64.
   Both operands are widened to 64 bits (zero-extended, as they are
   unsigned) and the 64-bit result is narrowed back to unsigned int.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide =
      _pext_u64((unsigned long long)__X, (unsigned long long)__Y);

  return (unsigned int)__wide;
}

#endif /* _ARCH_PWR7  */

#endif /* __PPC64__  */


#endif /* BMI2INTRIN_H_ */

__attribute__
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
Definition __clang_hip_libdevice_declares.h:285

int
__device__ int
Definition __clang_hip_libdevice_declares.h:68

__c
static __inline__ vector float vector float vector float __c
Definition altivec.h:4800

__p
static __inline__ uint32_t volatile uint32_t * __p
Definition arm_acle.h:57

_pext_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _pext_u32(unsigned int __X, unsigned int __Y)
Extract (gather) bits from the unsigned 32-bit integer __X into the low-order bits of the 32-bit resu...
Definition bmi2intrin.h:105

_bzhi_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _bzhi_u32(unsigned int __X, unsigned int __Y)
Copies the unsigned 32-bit integer __X and zeroes the upper bits starting at bit number __Y.
Definition bmi2intrin.h:47

_mulx_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
Multiplies the unsigned 32-bit integers __X and __Y to form a 64-bit product.
Definition bmi2intrin.h:130

_pdep_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _pdep_u32(unsigned int __X, unsigned int __Y)
Deposit (scatter) low-order bits from the unsigned 32-bit integer __X into the 32-bit result,...
Definition bmi2intrin.h:76

__P
__inline unsigned int unsigned int unsigned int * __P
Definition bmi2intrin.h:25

__Y
__inline unsigned int unsigned int __Y
Definition bmi2intrin.h:19