clang  15.0.0git
bmi2intrin.h
Go to the documentation of this file.
1 /*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #if !defined X86GPRINTRIN_H_
11 #error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
12 #endif
13 
14 #ifndef BMI2INTRIN_H_
15 #define BMI2INTRIN_H_
16 
/* Zero the high bits of __X starting at a given bit index (x86 BZHI).

   The index is taken from the low 8 bits of __Y, as the x86 instruction
   does.  Bits of __X at positions >= index are cleared; bits below it are
   returned unchanged.  An index of 0 yields 0, and an index of 32 or more
   yields __X unmodified.

   The result is built with a mask rather than a shift-left/shift-right
   pair, because shifting a 32-bit value by (32 - index) is undefined for
   index == 0 and for index > 32.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  const unsigned int __index = __Y & 0xFFU;

  /* Nothing to clear: also keeps the shift below strictly less than 32. */
  if (__index >= 32U)
    return __X;

  /* (1 << index) - 1 has exactly the low `index` bits set (0 for index 0). */
  return __X & ((1U << __index) - 1U);
}
22 
/* Unsigned 32 x 32 -> 64-bit multiply.

   Stores the high 32 bits of the product through __P and returns the low
   32 bits.  __P must point to writable storage; it is dereferenced
   unconditionally.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  /* Widen one operand first so the multiplication is done in 64 bits. */
  const unsigned long long __product = (unsigned long long)__X * __Y;
  const unsigned int __low = (unsigned int)__product;
  const unsigned int __high = (unsigned int)(__product >> 32);

  *__P = __high;
  return __low;
}
30 
31 #ifdef __PPC64__
/* Zero the high bits of __X starting at a given bit index (x86 BZHI,
   64-bit form).

   The index is taken from the low 8 bits of __Y, as the x86 instruction
   does.  Bits of __X at positions >= index are cleared; bits below it are
   returned unchanged.  An index of 0 yields 0, and an index of 64 or more
   yields __X unmodified.

   The result is built with a mask rather than a shift-left/shift-right
   pair, because shifting a 64-bit value by (64 - index) is undefined for
   index == 0 and for index > 64.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  const unsigned long long __index = __Y & 0xFFULL;

  /* Nothing to clear: also keeps the shift below strictly less than 64. */
  if (__index >= 64ULL)
    return __X;

  /* (1 << index) - 1 has exactly the low `index` bits set (0 for index 0). */
  return __X & ((1ULL << __index) - 1ULL);
}
37 
/* Unsigned 64 x 64 -> 128-bit multiply.

   Stores the high 64 bits of the product through __P and returns the low
   64 bits.  __P must point to writable storage; it is dereferenced
   unconditionally.  The 128-bit intermediate uses __int128, which requires
   a 64-bit base target.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  /* Widen one operand first so the multiplication is done in 128 bits. */
  const unsigned __int128 __product = (unsigned __int128)__X * __Y;
  const unsigned long long __low = (unsigned long long)__product;
  const unsigned long long __high = (unsigned long long)(__product >> 64);

  *__P = __high;
  return __low;
}
47 
48 #ifdef _ARCH_PWR7
49 /* popcount and bpermd require power7 minimum. */
/* Parallel bit deposit: scatter the low-order bits of __X into the bit
   positions that are set in __M, preserving their order; every result bit
   whose mask bit is clear is 0.  Returns 0 when __M is 0.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __msb = 0x8000000000000000UL;
  unsigned long __deposited = 0x0UL;
  unsigned long __pending = __M;

  /* 64 minus the number of mask bits still to process.  Subtracting the
     leading-zero count of the current mask bit from this gives the left
     shift that lines the matching source bit up with that mask bit.  */
  unsigned long __shift = 64 - __builtin_popcountl(__M);

  /* One iteration per set mask bit, from the most significant downwards;
     each processed bit is cleared from __pending.  */
  for (; __pending != 0; __shift++) {
    const unsigned long __lz = __builtin_clzl(__pending);
    const unsigned long __bit = __msb >> __lz;

    __deposited |= (__X << (__shift - __lz)) & __bit;
    __pending ^= __bit;
  }
  return __deposited;
}
75 
/* Parallel bit extract: gather the bits of __X that sit at the positions
   set in __M and pack them, in order, into the low-order bits of the
   result.  Returns 0 when __M is 0.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __msb = 0x8000000000000000UL;
  unsigned long __pending = __M;
  unsigned long __lz;
  unsigned long __extracted;

  /* A compile-time-constant mask that selects at most 8 bits can be
     handled by a single bit permute instruction (bpermd). */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* Initial bit permute control: every byte is 0x40.
       NOTE(review): 0x40 (64) is presumably an out-of-range bit index that
       bpermd turns into a 0 result bit -- confirm against the Power ISA. */
    unsigned long __control = 0x4040404040404040UL;
    long __k;

    /* With a constant mask the pop-count is constant as well, so this
       loop can be evaluated at compile time, leaving a constant permute
       control vector.  Each step shifts in the leading-zero count of the
       next mask bit and clears that bit. */
    for (__k = 0; __k < __builtin_popcountl(__M); __k++) {
      __lz = __builtin_clzl(__pending);
      __control = (__control << 8) | __lz;
      __pending ^= (__msb >> __lz);
    }
    __extracted = __builtin_bpermd(__control, __X);
  } else {
    /* 64 minus the number of mask bits still to process; subtracting the
       leading-zero count of the current mask bit gives the right shift
       that moves the selected source bit to its packed position. */
    unsigned long __shift = 64 - __builtin_popcountl(__M);

    __extracted = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while loop
       avoids unrolling, and the compiler commons the xor that clears
       the mask bit with the (__pending != 0) test.  The result is a
       more compact loop setup and body. */
    while (__pending != 0) {
      unsigned long __bit;

      __lz = __builtin_clzl(__pending);
      __bit = __msb >> __lz;
      __extracted |= (__X & __bit) >> (__shift - __lz);
      __pending ^= __bit;
      __shift++;
    }
  }
  return __extracted;
}
117 
118 /* these 32-bit implementations depend on 64-bit pdep/pext
119  which depend on _ARCH_PWR7. */
/* 32-bit parallel bit deposit, implemented on top of _pdep_u64: both
   arguments are zero-extended to 64 bits and the 64-bit result is
   truncated back to 32 bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pdep_u64(__X, __Y);

  return (unsigned int)__wide;
}
125 
/* 32-bit parallel bit extract, implemented on top of _pext_u64: both
   arguments are zero-extended to 64 bits and the 64-bit result is
   truncated back to 32 bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pext_u64(__X, __Y);

  return (unsigned int)__wide;
}
131 #endif /* _ARCH_PWR7 */
132 #endif /* __PPC64__ */
133 
134 #endif /* BMI2INTRIN_H_ */
int
__device__ int
Definition: __clang_hip_libdevice_declares.h:63
_pdep_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _pdep_u32(unsigned int __X, unsigned int __Y)
Definition: bmi2intrin.h:27
__P
__inline unsigned int unsigned int unsigned int * __P
Definition: bmi2intrin.h:25
__Y
__inline unsigned int unsigned int __Y
Definition: bmi2intrin.h:19
_mulx_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
Definition: bmi2intrin.h:70
_pext_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _pext_u32(unsigned int __X, unsigned int __Y)
Definition: bmi2intrin.h:33
__attribute__
__inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _bzhi_u32(unsigned int __X
__p
static __inline unsigned char unsigned int unsigned int unsigned int * __p
Definition: adxintrin.h:24
__c
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4788
_bzhi_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS _bzhi_u32(unsigned int __X, unsigned int __Y)
Definition: bmi2intrin.h:21