clang 20.0.0git
bmi2intrin.h
Go to the documentation of this file.
1/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#if !defined X86GPRINTRIN_H_
11#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
12#endif
13
14#ifndef BMI2INTRIN_H_
15#define BMI2INTRIN_H_
16
/* Zero the high bits of __X starting at bit index __Y (x86 BZHI).

   __X  value to copy.
   __Y  bit index; as on x86 only its low 8 bits are significant.

   Returns __X with every bit at position index or above cleared.  An
   index of 0 yields 0, and an index of 32 or more yields __X unchanged.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  const unsigned int __n = __Y & 0xFFu;

  /* A shift by the full width of the type (or more) is undefined in C,
     so the out-of-range index is handled explicitly and the in-range
     case is expressed as a mask whose shift count is at most 31.  */
  if (__n >= 32)
    return __X;
  return __X & ((1u << __n) - 1u);
}
22
/* Unsigned 32 x 32 -> 64-bit multiply (x86 MULX).

   __X, __Y  the two factors.
   __P       receives the upper 32 bits of the product.

   Returns the lower 32 bits of the product.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  /* Widen one factor first so the multiplication is carried out in
     64 bits and no part of the product is lost.  */
  const unsigned long long __wide_x = __X;
  const unsigned long long __product = __wide_x * __Y;
  const unsigned int __low_half = (unsigned int)__product;

  *__P = (unsigned int)(__product >> 32);
  return __low_half;
}
30
31#ifdef __PPC64__
/* Zero the high bits of __X starting at bit index __Y (64-bit x86 BZHI).

   __X  value to copy.
   __Y  bit index; as on x86 only its low 8 bits are significant.

   Returns __X with every bit at position index or above cleared.  An
   index of 0 yields 0, and an index of 64 or more yields __X unchanged.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  const unsigned long long __n = __Y & 0xFFull;

  /* A shift by the full width of the type (or more) is undefined in C,
     so the out-of-range index is handled explicitly and the in-range
     case is expressed as a mask whose shift count is at most 63.  */
  if (__n >= 64)
    return __X;
  return __X & ((1ull << __n) - 1ull);
}
37
/* Unsigned 64 x 64 -> 128-bit multiply (64-bit x86 MULX).  The product is
   formed in an unsigned __int128, which requires a 64-bit base target.

   __X, __Y  the two factors.
   __P       receives the upper 64 bits of the product.

   Returns the lower 64 bits of the product.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  unsigned __int128 __product = __X;
  __product *= __Y;

  const unsigned long long __low_half = (unsigned long long)__product;
  *__P = (unsigned long long)(__product >> 64);
  return __low_half;
}
47
48#ifdef _ARCH_PWR7
49/* popcount and bpermd require power7 minimum. */
/* Deposit (scatter) the low-order bits of __X into the bit positions
   selected by the 1 bits of __M; all other result bits are 0.

   __X  source whose low popcount(__M) bits are deposited.
   __M  mask naming the destination bit positions.

   Returns the scattered value.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __top_bit = 0x8000000000000000UL;
  unsigned long __pending = __M;
  unsigned long __scattered = 0x0UL;

  /* The mask's pop-count is the number of source bits to process.  64
     minus that count is the position, counted from the most significant
     bit, of the first source bit to place; it is also the base of the
     left shift that lines the source bit up with its mask bit.  */
  unsigned long __src_pos = 64 - __builtin_popcountl(__M);

  /* One pass per 1 bit in the mask, highest bit first; each pass clears
     the mask bit it has just handled.  */
  for (; __pending != 0; ++__src_pos) {
    const unsigned long __lead = __builtin_clzl(__pending);
    const unsigned long __select = __top_bit >> __lead;
    const unsigned long __aligned = __X << (__src_pos - __lead);

    __pending ^= __select;
    __scattered |= (__aligned & __select);
  }
  return __scattered;
}
75
/* Extract (gather) the bits of __X selected by the 1 bits of __M and
   pack them into the low-order bits of the result.

   __X  source value.
   __M  mask naming the source bit positions to collect.

   Returns the gathered bits, right-justified.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __top_bit = 0x8000000000000000UL;
  unsigned long __pending = __M;
  unsigned long __gathered;

  /* A compile-time-constant mask selecting at most 8 bits can be served
     by a single bit permute (bpermd).  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* With a constant mask the pop-count is constant too, so this loop
       can be evaluated at compile time, leaving a constant permute
       control vector.
       NOTE(review): the 0x40 (= 64) filler bytes look like out-of-range
       selectors that bpermd turns into 0 bits -- confirm against the
       Power ISA.  */
    unsigned long __control = 0x4040404040404040UL;
    long __k;

    for (__k = 0; __k < __builtin_popcountl(__M); __k++) {
      const unsigned long __lead = __builtin_clzl(__pending);
      __control = (__control << 8) | __lead;
      __pending ^= (__top_bit >> __lead);
    }
    __gathered = __builtin_bpermd(__control, __X);
    return __gathered;
  }

  /* General case.  __dst_pos is the position, counted from the most
     significant bit, that the next extracted bit must land on.  */
  unsigned long __dst_pos = 64 - __builtin_popcountl(__M);
  __gathered = 0;

  /* A for loop would work here, but combined with -funroll-loops it can
     expand to a lot of code.  The while loop avoids unrolling, and the
     compiler commons the xor that clears the mask bit with the
     (__pending != 0) test, giving a more compact loop setup and body.  */
  while (__pending != 0) {
    const unsigned long __lead = __builtin_clzl(__pending);
    const unsigned long __select = __top_bit >> __lead;
    const unsigned long __moved = (__X & __select) >> (__dst_pos - __lead);

    __pending ^= __select;
    __gathered |= __moved;
    __dst_pos++;
  }
  return __gathered;
}
117
/* The 32-bit pdep/pext variants are implemented on top of the 64-bit
   ones, so they are likewise only available under _ARCH_PWR7.  */

/* Deposit (scatter) the low-order bits of __X into the bit positions
   selected by the mask __Y.  Both arguments are zero-extended, handed to
   _pdep_u64, and the result is narrowed back to 32 bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pdep_u64(__X, __Y);
  return (unsigned int)__wide;
}
125
/* Extract (gather) the bits of __X selected by the mask __Y into the
   low-order bits of the result.  Both arguments are zero-extended, handed
   to _pext_u64, and the result is narrowed back to 32 bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pext_u64(__X, __Y);
  return (unsigned int)__wide;
}
131#endif /* _ARCH_PWR7 */
132#endif /* __PPC64__ */
133
134#endif /* BMI2INTRIN_H_ */
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
__device__ int
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:88
static __inline__ unsigned int __DEFAULT_FN_ATTRS _pext_u32(unsigned int __X, unsigned int __Y)
Extract (gather) bits from the unsigned 32-bit integer __X into the low-order bits of the 32-bit resu...
Definition: bmi2intrin.h:105
static __inline__ unsigned int __DEFAULT_FN_ATTRS _bzhi_u32(unsigned int __X, unsigned int __Y)
Copies the unsigned 32-bit integer __X and zeroes the upper bits starting at bit number __Y.
Definition: bmi2intrin.h:47
static __inline__ unsigned int __DEFAULT_FN_ATTRS _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
Multiplies the unsigned 32-bit integers __X and __Y to form a 64-bit product.
Definition: bmi2intrin.h:130
static __inline__ unsigned int __DEFAULT_FN_ATTRS _pdep_u32(unsigned int __X, unsigned int __Y)
Deposit (scatter) low-order bits from the unsigned 32-bit integer __X into the 32-bit result,...
Definition: bmi2intrin.h:76
__inline unsigned int unsigned int unsigned int * __P
Definition: bmi2intrin.h:25
__inline unsigned int unsigned int __Y
Definition: bmi2intrin.h:19