/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
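
/* As an illustration of the rewriting suggested above (a hypothetical
   sketch, not part of the original header; the typedef name is invented
   for the example): an SSE lane-wise add such as

       __m128i __c = _mm_add_epi32(__a, __b);

   can be written with the GNU C vector extension instead, which compiles
   to native vector instructions on both x86_64 and PowerPC:

       typedef int __gnu_v4si __attribute__((vector_size(16)));
       __gnu_v4si __va, __vb;
       __gnu_v4si __vc = __va + __vb;   // lane-wise 32-bit add
*/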

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__ppc64__) &&                                                      \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
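
/* Each composite macro above is a rounding direction OR'ed with an
   exception-control bit. Because macro bodies are expanded lazily at the
   point of use, defining _MM_FROUND_RAISE_EXC and _MM_FROUND_NO_EXC after
   the composites is valid C. For example:

       _MM_FROUND_FLOOR     == (0x03 | 0x00) == 0x03
       _MM_FROUND_NEARBYINT == (0x04 | 0x08) == 0x0c
*/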

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}
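
/* Usage sketch (illustrative only; assumes SSE2 helpers such as
   _mm_set_pd are visible through the <tmmintrin.h> include chain):

       __m128d __v = _mm_set_pd(2.75, -1.25);
       __m128d __f =
           _mm_round_pd(__v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
       // __f holds {floor(-1.25), floor(2.75)} == {-2.0, 2.0}, computed
       // without raising floating-point exceptions.
*/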

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
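
/* Usage sketch (illustrative only): the ceil/floor forms above are plain
   wrappers that pin the rounding mode, so

       __m128 __x = _mm_set_ps(1.5f, -1.5f, 2.25f, -0.75f);
       __m128 __up = _mm_ceil_ps(__x);
       __m128 __dn = _mm_floor_ps(__x);

   behaves exactly like calling _mm_round_ps with _MM_FROUND_CEIL or
   _MM_FROUND_FLOOR respectively. */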

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
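
/* Usage sketch (illustrative only): because the index is masked to the
   lane count, an insert followed by an extract at the same index
   round-trips:

       __m128i __v = _mm_set1_epi32(0);
       __v = _mm_insert_epi32(__v, 42, 1);
       int __e = _mm_extract_epi32(__v, 1); // __e == 42
       // An out-of-range index wraps: 5 & 3 == 1 addresses the same lane.
*/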

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
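
/* Note on the pre-Power10 path above: x86 PBLENDVB selects on the sign
   bit of each mask byte. vec_sra by a splat of 7 is an algebraic
   (sign-propagating) shift, so it broadcasts each byte's sign bit across
   the whole byte, yielding the all-ones/all-zeros selector that vec_sel
   expects. */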

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}
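
/* Note on the table above: each __pcv entry is a vec_perm control vector
   in which byte indices 0-15 select from __A and 16-31 select from __B;
   word i of entry __imm8 comes from __B exactly when bit i of __imm8 is
   set. For example, __imm8 == 0x5 (0b0101) takes words 0 and 2 from __B
   and words 1 and 3 from __A. The table has 16 entries, so __imm8 must
   be in the range 0-15. */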

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
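
/* Usage sketch (illustrative only): unlike the x86 PTEST instruction,
   these helpers return plain ints rather than setting condition flags,
   so they are used directly in C conditions:

       __m128i __mask = _mm_set1_epi8(0x0f);
       __m128i __val = _mm_set1_epi8(0x10);
       if (_mm_test_all_zeros(__mask, __val)) {
         // taken: (0x10 & 0x0f) == 0 in every byte
       }
*/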

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
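
/* Note: _mm_mullo_epi32 keeps only the low 32 bits of each of the four
   32x32-bit products (x86 PMULLD), while _mm_mul_epi32 multiplies just
   the even-indexed signed elements (0 and 2) and returns the two full
   64-bit products (x86 PMULDQ); vec_mule provides that widening
   multiply. */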

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif
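
/* Note: the sign extensions above chain vec_unpackh, each application of
   which sign-extends half of a vector's elements to the next wider
   element type, so the 8->16, 8->32, and 8->64 widenings take one, two,
   and three steps respectively. */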

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}
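
/* Note: AltiVec has no zero-extending unpack, so the _mm_cvtepu*
   conversions above instead interleave the source with a zero vector via
   vec_mergeh; the operand order is swapped between little and big endian
   so that the zero bits always land in the most-significant half of each
   widened element. */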

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
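
/* Usage sketch (illustrative only; assumes the SSE2 _mm_set_epi16 and
   _mm_extract_epi16 helpers from the include chain):

       __m128i __v = _mm_set_epi16(9, 8, 7, 6, 5, 4, 3, 2);
       __m128i __m = _mm_minpos_epu16(__v);
       // _mm_extract_epi16(__m, 0) == 2  (the minimum value)
       // _mm_extract_epi16(__m, 1) == 0  (its element index)
*/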

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */