1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is intended to help port code that uses Intel intrinsics
15  explicitly from x86_64 to powerpc64/powerpc64le.
16 
17  Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18  VMX/VSX ISA is a good match for vector float SIMD operations.
19  However, scalar float operations in vector (XMM) registers require
20  the POWER8 VSX ISA (2.07) level. There are differences in the data
21  format and placement of float scalars in the vector register, which
22  require extra steps to match SSE scalar float semantics on POWER.
23 
24  It should be noted that there are significant differences between X86_64's
25  MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended to use
26  portable <fenv.h> instead of accessing the MXCSR directly.
27 
28  Most SSE scalar float intrinsic operations can be performed more
29  efficiently as C language float scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications. */
31 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
32 #endif
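/* Editorial example (not part of the original header): a portable alternative
   to touching the MXCSR directly is the standard <fenv.h> interface, which
   maps onto the PowerPC FPSCR as well as the x86 MXCSR:

     #include <fenv.h>

     fesetround (FE_TOWARDZERO);      // e.g. select round-toward-zero
     feclearexcept (FE_ALL_EXCEPT);   // clear pending FP exception flags
*/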
33 
34 #ifndef _XMMINTRIN_H_INCLUDED
35 #define _XMMINTRIN_H_INCLUDED
36 
37 #if defined(__linux__) && defined(__ppc64__)
38 
39 /* Define four value permute mask */
40 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
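/* Illustrative note (editorial): _MM_SHUFFLE packs four 2-bit element
   selectors into one immediate, e.g. _MM_SHUFFLE(3,2,1,0) == 0xE4, the
   identity selector for _mm_shuffle_ps and _mm_shuffle_pi16 below. */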
41 
42 #include <altivec.h>
43 
44 /* Avoid collisions between altivec.h and strict adherence to C++ and
45  C11 standards. This should eventually be done inside altivec.h itself,
46  but only after testing a full distro build. */
47 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
48  (defined(__STDC_VERSION__) && \
49  __STDC_VERSION__ >= 201112L))
50 #undef vector
51 #undef pixel
52 #undef bool
53 #endif
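/* Editorial note: once the context-sensitive keywords above are #undef'ed,
   code in this translation unit must use the reserved spellings, e.g.
   __vector float v; rather than vector float v; (illustrative only). */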
54 
55 /* We need type definitions from the MMX header file. */
56 #include <mmintrin.h>
57 
58 /* Get _mm_malloc () and _mm_free (). */
59 #if __STDC_HOSTED__
60 #include <mm_malloc.h>
61 #endif
62 
63 /* The Intel API is flexible enough that we must allow aliasing with other
64  vector types, and their scalar components. */
65 typedef vector float __m128 __attribute__((__may_alias__));
66 
67 /* Unaligned version of the same type. */
68 typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
69 
70 /* Internal data types for implementing the intrinsics. */
71 typedef vector float __v4sf;
72 
73 /* Create an undefined vector. */
74 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75 _mm_undefined_ps (void)
76 {
77  __m128 __Y = __Y;
78  return __Y;
79 }
80 
81 /* Create a vector of zeros. */
82 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_setzero_ps (void)
84 {
85  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
86 }
87 
88 /* Load four SPFP values from P. The address must be 16-byte aligned. */
89 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_load_ps (float const *__P)
91 {
92  return ((__m128)vec_ld(0, (__v4sf*)__P));
93 }
94 
95 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
96 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_loadu_ps (float const *__P)
98 {
99  return (vec_vsx_ld(0, __P));
100 }
101 
102 /* Load four SPFP values in reverse order. The address must be aligned. */
103 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104 _mm_loadr_ps (float const *__P)
105 {
106  __v4sf __tmp;
107  __m128 result;
108  static const __vector unsigned char permute_vector =
109  { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
110  0x17, 0x10, 0x11, 0x12, 0x13 };
111 
112  __tmp = vec_ld (0, (__v4sf *) __P);
113  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
114  return result;
115 }
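/* Illustrative example (assumes a 16-byte aligned buffer): with
   float buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f };, _mm_loadr_ps (buf) yields a
   vector whose element 0 is 4.0f and element 3 is 1.0f, i.e. the reverse
   of _mm_load_ps (buf). */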
116 
117 /* Create a vector with all four elements equal to F. */
118 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119 _mm_set1_ps (float __F)
120 {
121  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
122 }
123 
124 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 _mm_set_ps1 (float __F)
126 {
127  return _mm_set1_ps (__F);
128 }
129 
130 /* Create the vector [Z Y X W]. */
131 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
133 {
134  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
135 }
136 
137 /* Create the vector [W X Y Z]. */
138 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
140 {
141  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
142 }
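/* Illustrative example (editorial): _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f) and
   _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f) produce the same vector; in both cases
   element 0 (the scalar/_ss position) is 1.0f and element 3 is 4.0f. */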
143 
144 /* Store four SPFP values. The address must be 16-byte aligned. */
145 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
146 _mm_store_ps (float *__P, __m128 __A)
147 {
148  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
149 }
150 
151 /* Store four SPFP values. The address need not be 16-byte aligned. */
152 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm_storeu_ps (float *__P, __m128 __A)
154 {
155  *(__m128_u *)__P = __A;
156 }
157 
158 /* Store four SPFP values in reverse order. The address must be aligned. */
159 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160 _mm_storer_ps (float *__P, __m128 __A)
161 {
162  __v4sf __tmp;
163  static const __vector unsigned char permute_vector =
164  { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
165  0x17, 0x10, 0x11, 0x12, 0x13 };
166 
167  __tmp = (__m128) vec_perm (__A, __A, permute_vector);
168 
169  _mm_store_ps (__P, __tmp);
170 }
171 
172 /* Store the lower SPFP value across four words. */
173 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
174 _mm_store1_ps (float *__P, __m128 __A)
175 {
176  __v4sf __va = vec_splat((__v4sf)__A, 0);
177  _mm_store_ps (__P, __va);
178 }
179 
180 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
181 _mm_store_ps1 (float *__P, __m128 __A)
182 {
183  _mm_store1_ps (__P, __A);
184 }
185 
186 /* Create a vector with element 0 as F and the rest zero. */
187 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188 _mm_set_ss (float __F)
189 {
190  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
191 }
192 
193 /* Sets the low SPFP value of A from the low value of B. */
194 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
195 _mm_move_ss (__m128 __A, __m128 __B)
196 {
197  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
198 
199  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
200 }
201 
202 /* Create a vector with element 0 as *P and the rest zero. */
203 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
204 _mm_load_ss (float const *__P)
205 {
206  return _mm_set_ss (*__P);
207 }
208 
209 /* Stores the lower SPFP value. */
210 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
211 _mm_store_ss (float *__P, __m128 __A)
212 {
213  *__P = ((__v4sf)__A)[0];
214 }
215 
216 /* Perform the respective operation on the lower SPFP (single-precision
217  floating-point) values of A and B; the upper three SPFP values are
218  passed through from A. */
219 
220 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_add_ss (__m128 __A, __m128 __B)
222 {
223 #ifdef _ARCH_PWR7
224  __m128 a, b, c;
225  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
226  /* PowerISA VSX does not allow partial (for just the lower float)
227  results. So to ensure we don't generate spurious exceptions
228  (from the upper float values) we splat the lower float
229  before we do the operation. */
230  a = vec_splat (__A, 0);
231  b = vec_splat (__B, 0);
232  c = a + b;
233  /* Then we merge the lower float result with the original upper
234  float elements from __A. */
235  return (vec_sel (__A, c, mask));
236 #else
237  __A[0] = __A[0] + __B[0];
238  return (__A);
239 #endif
240 }
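/* Worked example (editorial): with __A = {1, 2, 3, 4} and __B = {10, 20, 30, 40},
   _mm_add_ss (__A, __B) returns {11, 2, 3, 4}; only element 0 is summed and
   the upper three elements are copied from __A. */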
241 
242 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243 _mm_sub_ss (__m128 __A, __m128 __B)
244 {
245 #ifdef _ARCH_PWR7
246  __m128 a, b, c;
247  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
248  /* PowerISA VSX does not allow partial (for just the lower float)
249  results. So to ensure we don't generate spurious exceptions
250  (from the upper float values) we splat the lower float
251  before we do the operation. */
252  a = vec_splat (__A, 0);
253  b = vec_splat (__B, 0);
254  c = a - b;
255  /* Then we merge the lower float result with the original upper
256  float elements from __A. */
257  return (vec_sel (__A, c, mask));
258 #else
259  __A[0] = __A[0] - __B[0];
260  return (__A);
261 #endif
262 }
263 
264 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_mul_ss (__m128 __A, __m128 __B)
266 {
267 #ifdef _ARCH_PWR7
268  __m128 a, b, c;
269  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
270  /* PowerISA VSX does not allow partial (for just the lower float)
271  results. So to ensure we don't generate spurious exceptions
272  (from the upper float values) we splat the lower float
273  before we do the operation. */
274  a = vec_splat (__A, 0);
275  b = vec_splat (__B, 0);
276  c = a * b;
277  /* Then we merge the lower float result with the original upper
278  float elements from __A. */
279  return (vec_sel (__A, c, mask));
280 #else
281  __A[0] = __A[0] * __B[0];
282  return (__A);
283 #endif
284 }
285 
286 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287 _mm_div_ss (__m128 __A, __m128 __B)
288 {
289 #ifdef _ARCH_PWR7
290  __m128 a, b, c;
291  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
292  /* PowerISA VSX does not allow partial (for just the lower float)
293  results. So to ensure we don't generate spurious exceptions
294  (from the upper float values) we splat the lower float
295  before we do the operation. */
296  a = vec_splat (__A, 0);
297  b = vec_splat (__B, 0);
298  c = a / b;
299  /* Then we merge the lower float result with the original upper
300  float elements from __A. */
301  return (vec_sel (__A, c, mask));
302 #else
303  __A[0] = __A[0] / __B[0];
304  return (__A);
305 #endif
306 }
307 
308 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309 _mm_sqrt_ss (__m128 __A)
310 {
311  __m128 a, c;
312  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
313  /* PowerISA VSX does not allow partial (for just the lower float)
314  * results. So to ensure we don't generate spurious exceptions
315  * (from the upper float values) we splat the lower float
316  * before we do the operation. */
317  a = vec_splat (__A, 0);
318  c = vec_sqrt (a);
319  /* Then we merge the lower float result with the original upper
320  * float elements from __A. */
321  return (vec_sel (__A, c, mask));
322 }
323 
324 /* Perform the respective operation on the four SPFP values in A and B. */
325 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326 _mm_add_ps (__m128 __A, __m128 __B)
327 {
328  return (__m128) ((__v4sf)__A + (__v4sf)__B);
329 }
330 
331 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
332 _mm_sub_ps (__m128 __A, __m128 __B)
333 {
334  return (__m128) ((__v4sf)__A - (__v4sf)__B);
335 }
336 
337 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _mm_mul_ps (__m128 __A, __m128 __B)
339 {
340  return (__m128) ((__v4sf)__A * (__v4sf)__B);
341 }
342 
343 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
344 _mm_div_ps (__m128 __A, __m128 __B)
345 {
346  return (__m128) ((__v4sf)__A / (__v4sf)__B);
347 }
348 
349 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350 _mm_sqrt_ps (__m128 __A)
351 {
352  return (vec_sqrt ((__v4sf)__A));
353 }
354 
355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356 _mm_rcp_ps (__m128 __A)
357 {
358  return (vec_re ((__v4sf)__A));
359 }
360 
361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362 _mm_rsqrt_ps (__m128 __A)
363 {
364  return (vec_rsqrte (__A));
365 }
366 
367 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368 _mm_rcp_ss (__m128 __A)
369 {
370  __m128 a, c;
371  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
372  /* PowerISA VSX does not allow partial (for just the lower float)
373  * results. So to ensure we don't generate spurious exceptions
374  * (from the upper float values) we splat the lower float
375  * before we do the operation. */
376  a = vec_splat (__A, 0);
377  c = _mm_rcp_ps (a);
378  /* Then we merge the lower float result with the original upper
379  * float elements from __A. */
380  return (vec_sel (__A, c, mask));
381 }
382 
383 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
384 _mm_rsqrt_ss (__m128 __A)
385 {
386  __m128 a, c;
387  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
388  /* PowerISA VSX does not allow partial (for just the lower float)
389  * results. So to ensure we don't generate spurious exceptions
390  * (from the upper float values) we splat the lower float
391  * before we do the operation. */
392  a = vec_splat (__A, 0);
393  c = vec_rsqrte (a);
394  /* Then we merge the lower float result with the original upper
395  * float elements from __A. */
396  return (vec_sel (__A, c, mask));
397 }
398 
399 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_min_ss (__m128 __A, __m128 __B)
401 {
402  __v4sf a, b, c;
403  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
404  /* PowerISA VSX does not allow partial (for just lower float)
405  * results. So to ensure we don't generate spurious exceptions
406  * (from the upper float values) we splat the lower float
407  * before we do the operation. */
408  a = vec_splat ((__v4sf)__A, 0);
409  b = vec_splat ((__v4sf)__B, 0);
410  c = vec_min (a, b);
411  /* Then we merge the lower float result with the original upper
412  * float elements from __A. */
413  return (vec_sel ((__v4sf)__A, c, mask));
414 }
415 
416 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
417 _mm_max_ss (__m128 __A, __m128 __B)
418 {
419  __v4sf a, b, c;
420  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
421  /* PowerISA VSX does not allow partial (for just lower float)
422  * results. So to ensure we don't generate spurious exceptions
423  * (from the upper float values) we splat the lower float
424  * before we do the operation. */
425  a = vec_splat (__A, 0);
426  b = vec_splat (__B, 0);
427  c = vec_max (a, b);
428  /* Then we merge the lower float result with the original upper
429  * float elements from __A. */
430  return (vec_sel ((__v4sf)__A, c, mask));
431 }
432 
433 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
434 _mm_min_ps (__m128 __A, __m128 __B)
435 {
436  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
437  return vec_sel (__B, __A, m);
438 }
439 
440 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 _mm_max_ps (__m128 __A, __m128 __B)
442 {
443  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
444  return vec_sel (__B, __A, m);
445 }
446 
447 /* Perform logical bit-wise operations on 128-bit values. */
448 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_and_ps (__m128 __A, __m128 __B)
450 {
451  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
452 // return __builtin_ia32_andps (__A, __B);
453 }
454 
455 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
456 _mm_andnot_ps (__m128 __A, __m128 __B)
457 {
458  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
459 }
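/* Note the operand order (editorial): _mm_andnot_ps (__A, __B) computes
   (~__A) & __B, so the first argument is the one that is complemented,
   matching the x86 ANDNPS semantics. */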
460 
461 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
462 _mm_or_ps (__m128 __A, __m128 __B)
463 {
464  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
465 }
466 
467 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
468 _mm_xor_ps (__m128 __A, __m128 __B)
469 {
470  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
471 }
472 
473 /* Perform a comparison on the four SPFP values of A and B. For each
474  element, if the comparison is true, place a mask of all ones in the
475  result, otherwise a mask of zeros. */
476 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
477 _mm_cmpeq_ps (__m128 __A, __m128 __B)
478 {
479  return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
480 }
481 
482 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 _mm_cmplt_ps (__m128 __A, __m128 __B)
484 {
485  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
486 }
487 
488 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489 _mm_cmple_ps (__m128 __A, __m128 __B)
490 {
491  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
492 }
493 
494 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495 _mm_cmpgt_ps (__m128 __A, __m128 __B)
496 {
497  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
498 }
499 
500 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _mm_cmpge_ps (__m128 __A, __m128 __B)
502 {
503  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
504 }
505 
506 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 _mm_cmpneq_ps (__m128 __A, __m128 __B)
508 {
509  __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
510  return ((__m128)vec_nor (temp, temp));
511 }
512 
513 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
515 {
516  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
517 }
518 
519 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
520 _mm_cmpnle_ps (__m128 __A, __m128 __B)
521 {
522  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
523 }
524 
525 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526 _mm_cmpngt_ps (__m128 __A, __m128 __B)
527 {
528  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
529 }
530 
531 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
532 _mm_cmpnge_ps (__m128 __A, __m128 __B)
533 {
534  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
535 }
536 
537 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpord_ps (__m128 __A, __m128 __B)
539 {
540  __vector unsigned int a, b;
541  __vector unsigned int c, d;
542  static const __vector unsigned int float_exp_mask =
543  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
544 
545  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
546  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
547  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
548  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
549  return ((__m128 ) vec_and (c, d));
550 }
551 
552 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
553 _mm_cmpunord_ps (__m128 __A, __m128 __B)
554 {
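 /* Editorial note: only a NaN has a magnitude whose integer bit pattern is
    strictly greater than the exponent mask 0x7f800000 (infinity compares
    equal to it), so the vec_cmpgt tests below flag exactly the NaN lanes,
    and the OR marks a lane unordered if either input is a NaN. */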
555  __vector unsigned int a, b;
556  __vector unsigned int c, d;
557  static const __vector unsigned int float_exp_mask =
558  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
559 
560  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
561  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
562  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
563  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
564  return ((__m128 ) vec_or (c, d));
565 }
566 
567 /* Perform a comparison on the lower SPFP values of A and B. If the
568  comparison is true, place a mask of all ones in the result, otherwise a
569  mask of zeros. The upper three SPFP values are passed through from A. */
570 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
571 _mm_cmpeq_ss (__m128 __A, __m128 __B)
572 {
573  static const __vector unsigned int mask =
574  { 0xffffffff, 0, 0, 0 };
575  __v4sf a, b, c;
576  /* PowerISA VMX does not allow partial (for just element 0)
577  * results. So to ensure we don't generate spurious exceptions
578  * (from the upper elements) we splat the lower float
579  * before we do the operation. */
580  a = vec_splat ((__v4sf) __A, 0);
581  b = vec_splat ((__v4sf) __B, 0);
582  c = (__v4sf) vec_cmpeq(a, b);
583  /* Then we merge the lower float result with the original upper
584  * float elements from __A. */
585  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
586 }
587 
588 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589 _mm_cmplt_ss (__m128 __A, __m128 __B)
590 {
591  static const __vector unsigned int mask =
592  { 0xffffffff, 0, 0, 0 };
593  __v4sf a, b, c;
594  /* PowerISA VMX does not allow partial (for just element 0)
595  * results. So to ensure we don't generate spurious exceptions
596  * (from the upper elements) we splat the lower float
597  * before we do the operation. */
598  a = vec_splat ((__v4sf) __A, 0);
599  b = vec_splat ((__v4sf) __B, 0);
600  c = (__v4sf) vec_cmplt(a, b);
601  /* Then we merge the lower float result with the original upper
602  * float elements from __A. */
603  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
604 }
605 
606 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_cmple_ss (__m128 __A, __m128 __B)
608 {
609  static const __vector unsigned int mask =
610  { 0xffffffff, 0, 0, 0 };
611  __v4sf a, b, c;
612  /* PowerISA VMX does not allow partial (for just element 0)
613  * results. So to ensure we don't generate spurious exceptions
614  * (from the upper elements) we splat the lower float
615  * before we do the operation. */
616  a = vec_splat ((__v4sf) __A, 0);
617  b = vec_splat ((__v4sf) __B, 0);
618  c = (__v4sf) vec_cmple(a, b);
619  /* Then we merge the lower float result with the original upper
620  * float elements from __A. */
621  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
622 }
623 
624 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_cmpgt_ss (__m128 __A, __m128 __B)
626 {
627  static const __vector unsigned int mask =
628  { 0xffffffff, 0, 0, 0 };
629  __v4sf a, b, c;
630  /* PowerISA VMX does not allow partial (for just element 0)
631  * results. So to ensure we don't generate spurious exceptions
632  * (from the upper elements) we splat the lower float
633  * before we do the operation. */
634  a = vec_splat ((__v4sf) __A, 0);
635  b = vec_splat ((__v4sf) __B, 0);
636  c = (__v4sf) vec_cmpgt(a, b);
637  /* Then we merge the lower float result with the original upper
638  * float elements from __A. */
639  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
640 }
641 
642 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_cmpge_ss (__m128 __A, __m128 __B)
644 {
645  static const __vector unsigned int mask =
646  { 0xffffffff, 0, 0, 0 };
647  __v4sf a, b, c;
648  /* PowerISA VMX does not allow partial (for just element 0)
649  * results. So to ensure we don't generate spurious exceptions
650  * (from the upper elements) we splat the lower float
651  * before we do the operation. */
652  a = vec_splat ((__v4sf) __A, 0);
653  b = vec_splat ((__v4sf) __B, 0);
654  c = (__v4sf) vec_cmpge(a, b);
655  /* Then we merge the lower float result with the original upper
656  * float elements from __A. */
657  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
658 }
659 
660 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661 _mm_cmpneq_ss (__m128 __A, __m128 __B)
662 {
663  static const __vector unsigned int mask =
664  { 0xffffffff, 0, 0, 0 };
665  __v4sf a, b, c;
666  /* PowerISA VMX does not allow partial (for just element 0)
667  * results. So to ensure we don't generate spurious exceptions
668  * (from the upper elements) we splat the lower float
669  * before we do the operation. */
670  a = vec_splat ((__v4sf) __A, 0);
671  b = vec_splat ((__v4sf) __B, 0);
672  c = (__v4sf) vec_cmpeq(a, b);
673  c = vec_nor (c, c);
674  /* Then we merge the lower float result with the original upper
675  * float elements from __A. */
676  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
677 }
678 
679 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
681 {
682  static const __vector unsigned int mask =
683  { 0xffffffff, 0, 0, 0 };
684  __v4sf a, b, c;
685  /* PowerISA VMX does not allow partial (for just element 0)
686  * results. So to ensure we don't generate spurious exceptions
687  * (from the upper elements) we splat the lower float
688  * before we do the operation. */
689  a = vec_splat ((__v4sf) __A, 0);
690  b = vec_splat ((__v4sf) __B, 0);
691  c = (__v4sf) vec_cmpge(a, b);
692  /* Then we merge the lower float result with the original upper
693  * float elements from __A. */
694  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
695 }
696 
697 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_cmpnle_ss (__m128 __A, __m128 __B)
699 {
700  static const __vector unsigned int mask =
701  { 0xffffffff, 0, 0, 0 };
702  __v4sf a, b, c;
703  /* PowerISA VMX does not allow partial (for just element 0)
704  * results. So to ensure we don't generate spurious exceptions
705  * (from the upper elements) we splat the lower float
706  * before we do the operation. */
707  a = vec_splat ((__v4sf) __A, 0);
708  b = vec_splat ((__v4sf) __B, 0);
709  c = (__v4sf) vec_cmpgt(a, b);
710  /* Then we merge the lower float result with the original upper
711  * float elements from __A. */
712  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
713 }
714 
715 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716 _mm_cmpngt_ss (__m128 __A, __m128 __B)
717 {
718  static const __vector unsigned int mask =
719  { 0xffffffff, 0, 0, 0 };
720  __v4sf a, b, c;
721  /* PowerISA VMX does not allow partial (for just element 0)
722  * results. So to ensure we don't generate spurious exceptions
723  * (from the upper elements) we splat the lower float
724  * before we do the operation. */
725  a = vec_splat ((__v4sf) __A, 0);
726  b = vec_splat ((__v4sf) __B, 0);
727  c = (__v4sf) vec_cmple(a, b);
728  /* Then we merge the lower float result with the original upper
729  * float elements from __A. */
730  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
731 }
732 
733 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734 _mm_cmpnge_ss (__m128 __A, __m128 __B)
735 {
736  static const __vector unsigned int mask =
737  { 0xffffffff, 0, 0, 0 };
738  __v4sf a, b, c;
739  /* PowerISA VMX does not allow partial (for just element 0)
740  * results. So to ensure we don't generate spurious exceptions
741  * (from the upper elements) we splat the lower float
742  * before we do the operation. */
743  a = vec_splat ((__v4sf) __A, 0);
744  b = vec_splat ((__v4sf) __B, 0);
745  c = (__v4sf) vec_cmplt(a, b);
746  /* Then we merge the lower float result with the original upper
747  * float elements from __A. */
748  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
749 }
750 
751 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
752 _mm_cmpord_ss (__m128 __A, __m128 __B)
753 {
754  __vector unsigned int a, b;
755  __vector unsigned int c, d;
756  static const __vector unsigned int float_exp_mask =
757  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
758  static const __vector unsigned int mask =
759  { 0xffffffff, 0, 0, 0 };
760 
761  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
762  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
763  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
764  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
765  c = vec_and (c, d);
766  /* Then we merge the lower float result with the original upper
767  * float elements from __A. */
768  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
769 }
770 
771 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772 _mm_cmpunord_ss (__m128 __A, __m128 __B)
773 {
774  __vector unsigned int a, b;
775  __vector unsigned int c, d;
776  static const __vector unsigned int float_exp_mask =
777  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
778  static const __vector unsigned int mask =
779  { 0xffffffff, 0, 0, 0 };
780 
781  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
782  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
783  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
784  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
785  c = vec_or (c, d);
786  /* Then we merge the lower float result with the original upper
787  * float elements from __A. */
788  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
789 }
790 
791 /* Compare the lower SPFP values of A and B and return 1 if true
792  and 0 if false. */
793 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_comieq_ss (__m128 __A, __m128 __B)
795 {
796  return (__A[0] == __B[0]);
797 }
798 
799 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_comilt_ss (__m128 __A, __m128 __B)
801 {
802  return (__A[0] < __B[0]);
803 }
804 
805 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806 _mm_comile_ss (__m128 __A, __m128 __B)
807 {
808  return (__A[0] <= __B[0]);
809 }
810 
811 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
812 _mm_comigt_ss (__m128 __A, __m128 __B)
813 {
814  return (__A[0] > __B[0]);
815 }
816 
817 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_comige_ss (__m128 __A, __m128 __B)
819 {
820  return (__A[0] >= __B[0]);
821 }
822 
823 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm_comineq_ss (__m128 __A, __m128 __B)
825 {
826  return (__A[0] != __B[0]);
827 }
828 
829 /* FIXME
830  * The _mm_ucomi??_ss implementations below are exactly the same as
831  * _mm_comi??_ss because GCC for PowerPC only generates unordered
832  * compares (scalar and vector).
833  * Technically _mm_comieq_ss et al. should be using the ordered
834  * compare and signal for QNaNs.
835  * The _mm_ucomieq_ss et al. should be OK, as is.
836  */
837 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838 _mm_ucomieq_ss (__m128 __A, __m128 __B)
839 {
840  return (__A[0] == __B[0]);
841 }
842 
843 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844 _mm_ucomilt_ss (__m128 __A, __m128 __B)
845 {
846  return (__A[0] < __B[0]);
847 }
848 
849 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
850 _mm_ucomile_ss (__m128 __A, __m128 __B)
851 {
852  return (__A[0] <= __B[0]);
853 }
854 
855 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
856 _mm_ucomigt_ss (__m128 __A, __m128 __B)
857 {
858  return (__A[0] > __B[0]);
859 }
860 
861 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
862 _mm_ucomige_ss (__m128 __A, __m128 __B)
863 {
864  return (__A[0] >= __B[0]);
865 }
866 
867 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
868 _mm_ucomineq_ss (__m128 __A, __m128 __B)
869 {
870  return (__A[0] != __B[0]);
871 }
872 
873 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
874 _mm_cvtss_f32 (__m128 __A)
875 {
876  return ((__v4sf)__A)[0];
877 }
878 
879 /* Convert the lower SPFP value to a 32-bit integer according to the current
880  rounding mode. */
881 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
882 _mm_cvtss_si32 (__m128 __A)
883 {
884  __m64 res = 0;
885 #ifdef _ARCH_PWR8
886  double dtmp;
887  __asm__(
888 #ifdef __LITTLE_ENDIAN__
889  "xxsldwi %x0,%x0,%x0,3;\n"
890 #endif
891  "xscvspdp %x2,%x0;\n"
892  "fctiw %2,%2;\n"
893  "mfvsrd %1,%x2;\n"
894  : "+wa" (__A),
895  "=r" (res),
896  "=f" (dtmp)
897  : );
898 #else
899  res = __builtin_rint(__A[0]);
900 #endif
901  return (res);
902 }
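/* Illustrative example (editorial): under the default round-to-nearest-even
   mode, _mm_cvtss_si32 (_mm_set_ss (2.5f)) yields 2 and
   _mm_cvtss_si32 (_mm_set_ss (3.5f)) yields 4 (ties round to even). */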
903 
904 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
905 _mm_cvt_ss2si (__m128 __A)
906 {
907  return _mm_cvtss_si32 (__A);
908 }
909 
910 /* Convert the lower SPFP value to a 64-bit integer according to the
911  current rounding mode. */
912 
913 /* Intel intrinsic. */
914 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
915 _mm_cvtss_si64 (__m128 __A)
916 {
917  __m64 res = 0;
918 #ifdef _ARCH_PWR8
919  double dtmp;
920  __asm__(
921 #ifdef __LITTLE_ENDIAN__
922  "xxsldwi %x0,%x0,%x0,3;\n"
923 #endif
924  "xscvspdp %x2,%x0;\n"
925  "fctid %2,%2;\n"
926  "mfvsrd %1,%x2;\n"
927  : "+wa" (__A),
928  "=r" (res),
929  "=f" (dtmp)
930  : );
931 #else
932  res = __builtin_llrint(__A[0]);
933 #endif
934  return (res);
935 }
936 
937 /* Microsoft intrinsic. */
938 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_cvtss_si64x (__m128 __A)
940 {
941  return _mm_cvtss_si64 ((__v4sf) __A);
942 }
943 
944 /* Constants for use with _mm_prefetch. */
945 enum _mm_hint
946 {
947  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
948  _MM_HINT_ET0 = 7,
949  _MM_HINT_ET1 = 6,
950  _MM_HINT_T0 = 3,
951  _MM_HINT_T1 = 2,
952  _MM_HINT_T2 = 1,
953  _MM_HINT_NTA = 0
954 };
955 
956 /* Loads one cache line from address P to a location "closer" to the
957  processor. The selector I specifies the type of prefetch operation. */
958 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
959 _mm_prefetch (const void *__P, enum _mm_hint __I)
960 {
961  /* Current PowerPC implementations ignore the hint parameter. */
962  __builtin_prefetch (__P);
963 }
964 
965 /* Convert the two lower SPFP values to 32-bit integers according to the
966  current rounding mode. Return the integers in packed form. */
967 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
968 _mm_cvtps_pi32 (__m128 __A)
969 {
971  __v4sf temp, rounded;
972  __vector unsigned long long result;
973 
974  /* Splat two lower SPFP values to both halves. */
975  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
976  rounded = vec_rint(temp);
977  result = (__vector unsigned long long) vec_cts (rounded, 0);
978 
979  return (__m64) ((__vector long long) result)[0];
980 }
981 
982 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_cvt_ps2pi (__m128 __A)
984 {
985  return _mm_cvtps_pi32 (__A);
986 }
987 
988 /* Truncate the lower SPFP value to a 32-bit integer. */
989 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990 _mm_cvttss_si32 (__m128 __A)
991 {
992  /* Extract the lower float element. */
993  float temp = __A[0];
994  /* truncate to 32-bit integer and return. */
995  return temp;
996 }
997 
998 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
999 _mm_cvtt_ss2si (__m128 __A)
1000 {
1001  return _mm_cvttss_si32 (__A);
1002 }
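/* Illustrative contrast with the rounding conversions above (editorial):
   _mm_cvttss_si32 (_mm_set_ss (2.9f)) == 2 because the "tt" forms always
   truncate toward zero, independent of the current rounding mode. */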
1003 
1004 /* Intel intrinsic. */
1005 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1006 _mm_cvttss_si64 (__m128 __A)
1007 {
1008  /* Extract the lower float element. */
1009  float temp = __A[0];
1010  /* Truncate to a 64-bit integer and return. */
1011  return temp;
1012 }
1013 
1014 /* Microsoft intrinsic. */
1015 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016 _mm_cvttss_si64x (__m128 __A)
1017 {
1018  /* Extract the lower float element. */
1019  float temp = __A[0];
1020  /* Truncate to a 64-bit integer and return. */
1021  return temp;
1022 }
1023 
1024 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1025  integers in packed form. */
1026 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1027 _mm_cvttps_pi32 (__m128 __A)
1028 {
1029  __v4sf temp;
1030  __vector unsigned long long result;
1031 
1032  /* Splat two lower SPFP values to both halves. */
1033  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1034  result = (__vector unsigned long long) vec_cts (temp, 0);
1035 
1036  return (__m64) ((__vector long long) result)[0];
1037 }
1038 
1039 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_cvtt_ps2pi (__m128 __A)
1041 {
1042  return _mm_cvttps_pi32 (__A);
1043 }
1044 
1045 /* Convert B to a SPFP value and insert it as element zero in A. */
1046 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047 _mm_cvtsi32_ss (__m128 __A, int __B)
1048 {
1049  float temp = __B;
1050  __A[0] = temp;
1051 
1052  return __A;
1053 }
1054 
1055 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_cvt_si2ss (__m128 __A, int __B)
1057 {
1058  return _mm_cvtsi32_ss (__A, __B);
1059 }
1060 
1061 /* Convert B to a SPFP value and insert it as element zero in A. */
1062 /* Intel intrinsic. */
1063 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064 _mm_cvtsi64_ss (__m128 __A, long long __B)
1065 {
1066  float temp = __B;
1067  __A[0] = temp;
1068 
1069  return __A;
1070 }
1071 
1072 /* Microsoft intrinsic. */
1073 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1075 {
1076  return _mm_cvtsi64_ss (__A, __B);
1077 }
1078 
1079 /* Convert the two 32-bit values in B to SPFP form and insert them
1080  as the two lower elements in A. */
1081 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1083 {
1084  __vector signed int vm1;
1085  __vector float vf1;
1086 
1087  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1088  vf1 = (__vector float) vec_ctf (vm1, 0);
1089 
1090  return ((__m128) (__vector unsigned long long)
1091  { ((__vector unsigned long long)vf1) [0],
1092  ((__vector unsigned long long)__A) [1]});
1093 }
1094 
1095 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1097 {
1098  return _mm_cvtpi32_ps (__A, __B);
1099 }
1100 
1101 /* Convert the four signed 16-bit values in A to SPFP form. */
1102 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1103 _mm_cvtpi16_ps (__m64 __A)
1104 {
1105  __vector signed short vs8;
1106  __vector signed int vi4;
1107  __vector float vf1;
1108 
1109  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1110  vi4 = vec_vupklsh (vs8);
1111  vf1 = (__vector float) vec_ctf (vi4, 0);
1112 
1113  return (__m128) vf1;
1114 }
1115 
1116 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1118 _mm_cvtpu16_ps (__m64 __A)
1119 {
1120  const __vector unsigned short zero =
1121  { 0, 0, 0, 0, 0, 0, 0, 0 };
1122  __vector unsigned short vs8;
1123  __vector unsigned int vi4;
1124  __vector float vf1;
1125 
1126  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1127  vi4 = (__vector unsigned int) vec_mergel
1128 #ifdef __LITTLE_ENDIAN__
1129  (vs8, zero);
1130 #else
1131  (zero, vs8);
1132 #endif
1133  vf1 = (__vector float) vec_ctf (vi4, 0);
1134 
1135  return (__m128) vf1;
1136 }
1137 
1138 /* Convert the low four signed 8-bit values in A to SPFP form. */
1139 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1140 _mm_cvtpi8_ps (__m64 __A)
1141 {
1142  __vector signed char vc16;
1143  __vector signed short vs8;
1144  __vector signed int vi4;
1145  __vector float vf1;
1146 
1147  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1148  vs8 = vec_vupkhsb (vc16);
1149  vi4 = vec_vupkhsh (vs8);
1150  vf1 = (__vector float) vec_ctf (vi4, 0);
1151 
1152  return (__m128) vf1;
1153 }
1154 
1155 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1156 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1157 
1158 _mm_cvtpu8_ps (__m64 __A)
1159 {
1160  const __vector unsigned char zero =
1161  { 0, 0, 0, 0, 0, 0, 0, 0 };
1162  __vector unsigned char vc16;
1163  __vector unsigned short vs8;
1164  __vector unsigned int vi4;
1165  __vector float vf1;
1166 
1167  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1168 #ifdef __LITTLE_ENDIAN__
1169  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1170  vi4 = (__vector unsigned int) vec_mergeh (vs8,
1171  (__vector unsigned short) zero);
1172 #else
1173  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1174  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1175  vs8);
1176 #endif
1177  vf1 = (__vector float) vec_ctf (vi4, 0);
1178 
1179  return (__m128) vf1;
1180 }
1181 
1182 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1183 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1185 {
1186  __vector signed int vi4;
1187  __vector float vf4;
1188 
1189  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1190  vf4 = (__vector float) vec_ctf (vi4, 0);
1191  return (__m128) vf4;
1192 }
1193 
1194 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1195 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1196 _mm_cvtps_pi16 (__m128 __A)
1197 {
1198  __v4sf rounded;
1199  __vector signed int temp;
1200  __vector unsigned long long result;
1201 
1202  rounded = vec_rint(__A);
1203  temp = vec_cts (rounded, 0);
1204  result = (__vector unsigned long long) vec_pack (temp, temp);
1205 
1206  return (__m64) ((__vector long long) result)[0];
1207 }
1208 
1209 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1210 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1211 _mm_cvtps_pi8 (__m128 __A)
1212 {
1213  __v4sf rounded;
1214  __vector signed int tmp_i;
1215  static const __vector signed int zero = {0, 0, 0, 0};
1216  __vector signed short tmp_s;
1217  __vector signed char res_v;
1218 
1219  rounded = vec_rint(__A);
1220  tmp_i = vec_cts (rounded, 0);
1221  tmp_s = vec_pack (tmp_i, zero);
1222  res_v = vec_pack (tmp_s, tmp_s);
1223  return (__m64) ((__vector long long) res_v)[0];
1224 }
1225 
1226 /* Selects four specific SPFP values from A and B based on MASK. */
1227 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228 
1229 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1230 {
1231  unsigned long element_selector_10 = __mask & 0x03;
1232  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1233  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1234  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1235  static const unsigned int permute_selectors[4] =
1236  {
1237 #ifdef __LITTLE_ENDIAN__
1238  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1239 #else
1240  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1241 #endif
1242  };
1243  __vector unsigned int t;
1244 
1245  t[0] = permute_selectors[element_selector_10];
1246  t[1] = permute_selectors[element_selector_32];
1247  t[2] = permute_selectors[element_selector_54] + 0x10101010;
1248  t[3] = permute_selectors[element_selector_76] + 0x10101010;
1249  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1250 }
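/* Worked example (editorial): with __A = {a0, a1, a2, a3} and
   __B = {b0, b1, b2, b3},
   _mm_shuffle_ps (__A, __B, _MM_SHUFFLE (3, 2, 1, 0)) returns {a0, a1, b2, b3};
   the two low selectors pick from __A and the two high selectors from __B. */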
1251 
1252 /* Selects and interleaves the upper two SPFP values from A and B. */
1253 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1254 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1255 {
1256  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1257 }
1258 
1259 /* Selects and interleaves the lower two SPFP values from A and B. */
1260 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1262 {
1263  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1264 }
1265 
1266 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1267  the lower two values are passed through from A. */
1268 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1270 {
1271  __vector unsigned long long __a = (__vector unsigned long long)__A;
1272  __vector unsigned long long __p = vec_splats(*__P);
1273  __a [1] = __p [1];
1274 
1275  return (__m128)__a;
1276 }
1277 
1278 /* Stores the upper two SPFP values of A into P. */
1279 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_storeh_pi (__m64 *__P, __m128 __A)
1281 {
1282  __vector unsigned long long __a = (__vector unsigned long long) __A;
1283 
1284  *__P = __a[1];
1285 }
1286 
1287 /* Moves the upper two values of B into the lower two values of A. */
1288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm_movehl_ps (__m128 __A, __m128 __B)
1290 {
1291  return (__m128) vec_mergel ((__vector unsigned long long)__B,
1292  (__vector unsigned long long)__A);
1293 }
1294 
1295 /* Moves the lower two values of B into the upper two values of A. */
1296 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _mm_movelh_ps (__m128 __A, __m128 __B)
1298 {
1299  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1300  (__vector unsigned long long)__B);
1301 }
1302 
1303 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1304  the upper two values are passed through from A. */
1305 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1306 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1307 {
1308  __vector unsigned long long __a = (__vector unsigned long long)__A;
1309  __vector unsigned long long __p = vec_splats(*__P);
1310  __a [0] = __p [0];
1311 
1312  return (__m128)__a;
1313 }
1314 
1315 /* Stores the lower two SPFP values of A into P. */
1316 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_storel_pi (__m64 *__P, __m128 __A)
1318 {
1319  __vector unsigned long long __a = (__vector unsigned long long) __A;
1320 
1321  *__P = __a[0];
1322 }
1323 
1324 #ifdef _ARCH_PWR8
1325 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1326 
1327 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1328 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_movemask_ps (__m128 __A)
1330 {
1331  __vector unsigned long long result;
1332  static const __vector unsigned int perm_mask =
1333  {
1334 #ifdef __LITTLE_ENDIAN__
1335  0x00204060, 0x80808080, 0x80808080, 0x80808080
1336 #else
1337  0x80808080, 0x80808080, 0x80808080, 0x00204060
1338 #endif
1339  };
1340 
1341  result = ((__vector unsigned long long)
1342  vec_vbpermq ((__vector unsigned char) __A,
1343  (__vector unsigned char) perm_mask));
1344 
1345 #ifdef __LITTLE_ENDIAN__
1346  return result[1];
1347 #else
1348  return result[0];
1349 #endif
1350 }
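/* Illustrative example (editorial): for the vector {-1.0f, 2.0f, -3.0f, 4.0f}
   the sign bits of elements 0 and 2 are set, so _mm_movemask_ps returns 0x5. */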
1351 #endif /* _ARCH_PWR8 */
1352 
1353 /* Create a vector with all four elements equal to *P. */
1354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm_load1_ps (float const *__P)
1356 {
1357  return _mm_set1_ps (*__P);
1358 }
1359 
1360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1361 _mm_load_ps1 (float const *__P)
1362 {
1363  return _mm_load1_ps (__P);
1364 }
1365 
1366 /* Extracts one of the four words of A. The selector N must be immediate. */
1367 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1368 _mm_extract_pi16 (__m64 const __A, int const __N)
1369 {
1370  unsigned int shiftr = __N & 3;
1371 #ifdef __BIG_ENDIAN__
1372  shiftr = 3 - shiftr;
1373 #endif
1374 
1375  return ((__A >> (shiftr * 16)) & 0xffff);
1376 }
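/* Illustrative example (editorial, using _mm_setr_pi16 from mmintrin.h):
   with __m64 m = _mm_setr_pi16 (10, 20, 30, 40);, _mm_extract_pi16 (m, 2)
   returns 30, i.e. element 2 of the packed vector. */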
1377 
1378 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _m_pextrw (__m64 const __A, int const __N)
1380 {
1381  return _mm_extract_pi16 (__A, __N);
1382 }
1383 
1384 /* Inserts word D into one of four words of A. The selector N must be
1385  immediate. */
1386 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1388 {
1389  const int shiftl = (__N & 3) * 16;
1390  const __m64 shiftD = (const __m64) __D << shiftl;
1391  const __m64 mask = 0xffffUL << shiftl;
1392  __m64 result = (__A & (~mask)) | (shiftD & mask);
1393 
1394  return (result);
1395 }
1396 
1397 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1398 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1399 {
1400  return _mm_insert_pi16 (__A, __D, __N);
1401 }
1402 
1403 /* Compute the element-wise maximum of signed 16-bit values. */
1404 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 
1406 _mm_max_pi16 (__m64 __A, __m64 __B)
1407 {
1408 #if _ARCH_PWR8
1409  __vector signed short a, b, r;
1410  __vector __bool short c;
1411 
1412  a = (__vector signed short)vec_splats (__A);
1413  b = (__vector signed short)vec_splats (__B);
1414  c = (__vector __bool short)vec_cmpgt (a, b);
1415  r = vec_sel (b, a, c);
1416  return (__m64) ((__vector long long) r)[0];
1417 #else
1418  __m64_union m1, m2, res;
1419 
1420  m1.as_m64 = __A;
1421  m2.as_m64 = __B;
1422 
1423  res.as_short[0] =
1424  (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1425  res.as_short[1] =
1426  (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1427  res.as_short[2] =
1428  (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1429  res.as_short[3] =
1430  (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1431 
1432  return (__m64) res.as_m64;
1433 #endif
1434 }
1435 
1436 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1437 _m_pmaxsw (__m64 __A, __m64 __B)
1438 {
1439  return _mm_max_pi16 (__A, __B);
1440 }
1441 
1442 /* Compute the element-wise maximum of unsigned 8-bit values. */
1443 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1444 _mm_max_pu8 (__m64 __A, __m64 __B)
1445 {
1446 #if _ARCH_PWR8
1447  __vector unsigned char a, b, r;
1448  __vector __bool char c;
1449 
1450  a = (__vector unsigned char)vec_splats (__A);
1451  b = (__vector unsigned char)vec_splats (__B);
1452  c = (__vector __bool char)vec_cmpgt (a, b);
1453  r = vec_sel (b, a, c);
1454  return (__m64) ((__vector long long) r)[0];
1455 #else
1456  __m64_union m1, m2, res;
1457  long i;
1458 
1459  m1.as_m64 = __A;
1460  m2.as_m64 = __B;
1461 
1462 
1463  for (i = 0; i < 8; i++)
1464  res.as_char[i] =
1465  ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1466  m1.as_char[i] : m2.as_char[i];
1467 
1468  return (__m64) res.as_m64;
1469 #endif
1470 }
1471 
1472 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1473 _m_pmaxub (__m64 __A, __m64 __B)
1474 {
1475  return _mm_max_pu8 (__A, __B);
1476 }
1477 
1478 /* Compute the element-wise minimum of signed 16-bit values. */
1479 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1480 _mm_min_pi16 (__m64 __A, __m64 __B)
1481 {
1482 #if _ARCH_PWR8
1483  __vector signed short a, b, r;
1484  __vector __bool short c;
1485 
1486  a = (__vector signed short)vec_splats (__A);
1487  b = (__vector signed short)vec_splats (__B);
1488  c = (__vector __bool short)vec_cmplt (a, b);
1489  r = vec_sel (b, a, c);
1490  return (__m64) ((__vector long long) r)[0];
1491 #else
1492  __m64_union m1, m2, res;
1493 
1494  m1.as_m64 = __A;
1495  m2.as_m64 = __B;
1496 
1497  res.as_short[0] =
1498  (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1499  res.as_short[1] =
1500  (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1501  res.as_short[2] =
1502  (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1503  res.as_short[3] =
1504  (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1505 
1506  return (__m64) res.as_m64;
1507 #endif
1508 }
1509 
1510 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1511 _m_pminsw (__m64 __A, __m64 __B)
1512 {
1513  return _mm_min_pi16 (__A, __B);
1514 }
1515 
1516 /* Compute the element-wise minimum of unsigned 8-bit values. */
1517 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1518 _mm_min_pu8 (__m64 __A, __m64 __B)
1519 {
1520 #if _ARCH_PWR8
1521  __vector unsigned char a, b, r;
1522  __vector __bool char c;
1523 
1524  a = (__vector unsigned char)vec_splats (__A);
1525  b = (__vector unsigned char)vec_splats (__B);
1526  c = (__vector __bool char)vec_cmplt (a, b);
1527  r = vec_sel (b, a, c);
1528  return (__m64) ((__vector long long) r)[0];
1529 #else
1530  __m64_union m1, m2, res;
1531  long i;
1532 
1533  m1.as_m64 = __A;
1534  m2.as_m64 = __B;
1535 
1536 
1537  for (i = 0; i < 8; i++)
1538  res.as_char[i] =
1539  ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1540  m1.as_char[i] : m2.as_char[i];
1541 
1542  return (__m64) res.as_m64;
1543 #endif
1544 }
1545 
1546 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1547 _m_pminub (__m64 __A, __m64 __B)
1548 {
1549  return _mm_min_pu8 (__A, __B);
1550 }
1551 
1552 /* Create an 8-bit mask of the signs of 8-bit values. */
1553 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1554 _mm_movemask_pi8 (__m64 __A)
1555 {
1556  unsigned long long p =
1557 #ifdef __LITTLE_ENDIAN__
1558  0x0008101820283038UL; // permute control for sign bits
1559 #else
1560  0x3830282018100800UL; // permute control for sign bits
1561 #endif
1562  return __builtin_bpermd (p, __A);
1563 }
1564 
1565 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1566 _m_pmovmskb (__m64 __A)
1567 {
1568  return _mm_movemask_pi8 (__A);
1569 }
1570 
1571 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1572  in B and produce the high 16 bits of the 32-bit results. */
1573 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1574 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1575 {
1576  __vector unsigned short a, b;
1577  __vector unsigned short c;
1578  __vector unsigned int w0, w1;
1579  __vector unsigned char xform1 = {
1580 #ifdef __LITTLE_ENDIAN__
1581  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1582  0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1583 #else
1584  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1585  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1586 #endif
1587  };
1588 
1589  a = (__vector unsigned short)vec_splats (__A);
1590  b = (__vector unsigned short)vec_splats (__B);
1591 
1592  w0 = vec_vmuleuh (a, b);
1593  w1 = vec_vmulouh (a, b);
1594  c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1595 
1596  return (__m64) ((__vector long long) c)[0];
1597 }
1598 
1599 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1600 _m_pmulhuw (__m64 __A, __m64 __B)
1601 {
1602  return _mm_mulhi_pu16 (__A, __B);
1603 }
1604 
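/* Illustrative usage sketch (editorial addition; values are hypothetical):
   with every 16-bit lane holding 0xFFFF, each product is
   0xFFFF * 0xFFFF = 0xFFFE0001, so each result lane is the high half
   0xFFFE.  */
static __inline __m64
example_mulhi_pu16 (void)
{
  __m64 a = 0xFFFFFFFFFFFFFFFFUL;
  __m64 b = 0xFFFFFFFFFFFFFFFFUL;
  return _mm_mulhi_pu16 (a, b);    /* expected: 0xFFFEFFFEFFFEFFFEUL.  */
}
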
1605 /* Return a combination of the four 16-bit values in A. The selector
1606  must be an immediate. */
1607 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1608 _mm_shuffle_pi16 (__m64 __A, int const __N)
1609 {
1610  unsigned long element_selector_10 = __N & 0x03;
1611  unsigned long element_selector_32 = (__N >> 2) & 0x03;
1612  unsigned long element_selector_54 = (__N >> 4) & 0x03;
1613  unsigned long element_selector_76 = (__N >> 6) & 0x03;
1614  static const unsigned short permute_selectors[4] =
1615  {
1616 #ifdef __LITTLE_ENDIAN__
1617  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1618 #else
1619  0x0607, 0x0405, 0x0203, 0x0001
1620 #endif
1621  };
1622  __m64_union t;
1623  __vector unsigned long long a, p, r;
1624 
1625 #ifdef __LITTLE_ENDIAN__
1626  t.as_short[0] = permute_selectors[element_selector_10];
1627  t.as_short[1] = permute_selectors[element_selector_32];
1628  t.as_short[2] = permute_selectors[element_selector_54];
1629  t.as_short[3] = permute_selectors[element_selector_76];
1630 #else
1631  t.as_short[3] = permute_selectors[element_selector_10];
1632  t.as_short[2] = permute_selectors[element_selector_32];
1633  t.as_short[1] = permute_selectors[element_selector_54];
1634  t.as_short[0] = permute_selectors[element_selector_76];
1635 #endif
1636  p = vec_splats (t.as_m64);
1637  a = vec_splats (__A);
1638  r = vec_perm (a, a, (__vector unsigned char)p);
1639  return (__m64) ((__vector long long) r)[0];
1640 }
1641 
1642 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1643 _m_pshufw (__m64 __A, int const __N)
1644 {
1645  return _mm_shuffle_pi16 (__A, __N);
1646 }
1647 
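/* Illustrative usage sketch (editorial addition; the helper name is
   hypothetical): _MM_SHUFFLE(w,x,y,z) places source lane z in result
   lane 0, y in lane 1, x in lane 2 and w in lane 3, so
   _MM_SHUFFLE(0,1,2,3) reverses the four 16-bit lanes.  */
static __inline __m64
example_reverse_pi16 (__m64 a)
{
  return _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));
}
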
1648 /* Conditionally store byte elements of A into P. The high bit of each
1649  byte in the selector N determines whether the corresponding byte from
1650  A is stored. */
1651 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1652 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1653 {
1654  __m64 hibit = 0x8080808080808080UL;
1655  __m64 mask, tmp;
1656  __m64 *p = (__m64*)__P;
1657 
1658  tmp = *p;
1659  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1660  tmp = (tmp & (~mask)) | (__A & mask);
1661  *p = tmp;
1662 }
1663 
1664 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1665 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1666 {
1667  _mm_maskmove_si64 (__A, __N, __P);
1668 }
1669 
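/* Illustrative usage sketch (editorial addition; names and values are
   hypothetical): only the bytes whose mask byte has its high bit set are
   stored, so here just byte 0 of the 8-byte buffer is updated.  Note the
   implementation above performs a read-modify-write of all 8 bytes.  */
static __inline void
example_maskmove (char *buf8)            /* must point to at least 8 bytes.  */
{
  __m64 data = 0x1122334455667788UL;
  __m64 mask = 0x00000000000000FFUL;     /* high bit set only in byte 0.  */
  _mm_maskmove_si64 (data, mask, buf8);  /* stores 0x88 into buf8[0].  */
}
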
1670 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1671 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672 _mm_avg_pu8 (__m64 __A, __m64 __B)
1673 {
1674  __vector unsigned char a, b, c;
1675 
1676  a = (__vector unsigned char)vec_splats (__A);
1677  b = (__vector unsigned char)vec_splats (__B);
1678  c = vec_avg (a, b);
1679  return (__m64) ((__vector long long) c)[0];
1680 }
1681 
1682 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683 _m_pavgb (__m64 __A, __m64 __B)
1684 {
1685  return _mm_avg_pu8 (__A, __B);
1686 }
1687 
1688 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1689 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1690 _mm_avg_pu16 (__m64 __A, __m64 __B)
1691 {
1692  __vector unsigned short a, b, c;
1693 
1694  a = (__vector unsigned short)vec_splats (__A);
1695  b = (__vector unsigned short)vec_splats (__B);
1696  c = vec_avg (a, b);
1697  return (__m64) ((__vector long long) c)[0];
1698 }
1699 
1700 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1701 _m_pavgw (__m64 __A, __m64 __B)
1702 {
1703  return _mm_avg_pu16 (__A, __B);
1704 }
1705 
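/* Illustrative usage sketch (editorial addition; values are hypothetical):
   each lane computes the rounded-up average (a + b + 1) >> 1, so
   averaging 1 and 2 gives 2 in every byte.  */
static __inline __m64
example_avg_pu8 (void)
{
  __m64 a = 0x0101010101010101UL;
  __m64 b = 0x0202020202020202UL;
  return _mm_avg_pu8 (a, b);       /* expected: 0x0202020202020202UL.  */
}
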
1706 /* Compute the sum of the absolute differences of the unsigned 8-bit
1707  values in A and B. Return the value in the lower 16-bit word; the
1708  upper words are cleared. */
1709 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1710 _mm_sad_pu8 (__m64 __A, __m64 __B)
1711 {
1712  __vector unsigned char a, b;
1713  __vector unsigned char vmin, vmax, vabsdiff;
1714  __vector signed int vsum;
1715  const __vector unsigned int zero =
1716  { 0, 0, 0, 0 };
1717  __m64_union result = {0};
1718 
1719  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1720  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1721  vmin = vec_min (a, b);
1722  vmax = vec_max (a, b);
1723  vabsdiff = vec_sub (vmax, vmin);
1724  /* Sum four groups of bytes into integers. */
1725  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1726  /* Sum across four integers with integer result. */
1727  vsum = vec_sums (vsum, (__vector signed int) zero);
1728  /* The sum is in the rightmost 32 bits of the vector result.
1729  Transfer to a GPR and truncate to 16 bits. */
1730  result.as_short[0] = vsum[3];
1731  return result.as_m64;
1732 }
1733 
1734 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1735 _m_psadbw (__m64 __A, __m64 __B)
1736 {
1737  return _mm_sad_pu8 (__A, __B);
1738 }
1739 
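/* Illustrative usage sketch (editorial addition; values are hypothetical):
   the absolute differences are |10 - 3| = 7 for byte 0 and 0 elsewhere, so
   the sum 7 is returned in the low 16-bit word with the upper words
   cleared.  */
static __inline __m64
example_sad_pu8 (void)
{
  __m64 a = 0x000000000000000AUL;  /* byte 0 = 10, others 0.  */
  __m64 b = 0x0000000000000003UL;  /* byte 0 = 3, others 0.  */
  return _mm_sad_pu8 (a, b);       /* expected: 0x0000000000000007UL.  */
}
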
1740 /* Stores the data in A to the address P without polluting the caches. */
1741 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1742 _mm_stream_pi (__m64 *__P, __m64 __A)
1743 {
1744  /* Use the data cache block touch for store transient. */
1745  __asm__ (
1746  " dcbtstt 0,%0"
1747  :
1748  : "b" (__P)
1749  : "memory"
1750  );
1751  *__P = __A;
1752 }
1753 
1754 /* Likewise. The address must be 16-byte aligned. */
1755 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756 _mm_stream_ps (float *__P, __m128 __A)
1757 {
1758  /* Use the data cache block touch for store transient. */
1759  __asm__ (
1760  " dcbtstt 0,%0"
1761  :
1762  : "b" (__P)
1763  : "memory"
1764  );
1765  _mm_store_ps (__P, __A);
1766 }
1767 
1768 /* Guarantees that every preceding store is globally visible before
1769  any subsequent store. */
1770 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1771 _mm_sfence (void)
1772 {
1773  /* Generate a lightweight sync. */
1774  __atomic_thread_fence (__ATOMIC_RELEASE);
1775 }
1776 
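/* Illustrative usage sketch (editorial addition; names are hypothetical):
   a typical producer stores its payload, issues _mm_sfence, then sets a
   flag, so the payload store becomes visible no later than the flag
   store.  */
static __inline void
example_publish (float *payload, volatile int *ready_flag)
{
  payload[0] = 1.0f;  /* produce the data ...  */
  _mm_sfence ();      /* ... order it before the flag ...  */
  *ready_flag = 1;    /* ... then publish.  */
}
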
1777 /* The execution of the next instruction is delayed by an
1778  implementation-specific amount of time. The instruction does not
1779  modify the architectural state. This does not require SSE support in
1780  the processor -- the encoding is a nop on processors that do not
1781  support it. */
1782 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1783 _mm_pause (void)
1784 {
1785  /* There is no exact match with this construct, but the following is
1786  close to the desired effect. */
1787 #if _ARCH_PWR8
1788  /* On power8 and later processors we can depend on Program Priority
1789  (PRI) and the associated "very low" PRI setting. Since we don't know
1790  what PRI this thread is running at we: 1) save the current PRI
1791  from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1792  via the special or 31,31,31 encoding, 3) issue an "isync" to
1793  ensure the PRI change takes effect before we execute any more
1794  instructions.
1795  Now we can execute a lwsync (release barrier) while we execute
1796  this thread at "very low" PRI. Finally we restore the original
1797  PRI and continue execution. */
1798  unsigned long __PPR;
1799 
1800  __asm__ volatile (
1801  " mfppr %0;"
1802  " or 31,31,31;"
1803  " isync;"
1804  " lwsync;"
1805  " isync;"
1806  " mtppr %0;"
1807  : "=r" (__PPR)
1808  :
1809  : "memory"
1810  );
1811 #else
1812  /* For older processors, where we may not even have Program Priority
1813  controls, we can only depend on a heavyweight sync. */
1814  __atomic_thread_fence (__ATOMIC_SEQ_CST);
1815 #endif
1816 }
1817 
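/* Illustrative usage sketch (editorial addition; the flag name is
   hypothetical): _mm_pause is normally placed inside a spin-wait loop to
   reduce the cost of busy waiting.  */
static __inline void
example_spin_wait (volatile int *ready_flag)
{
  while (*ready_flag == 0)
    _mm_pause ();
}
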
1818 /* Transpose the 4x4 matrix composed of row[0-3]. */
1819 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1820 do { \
1821  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1822  __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1823  __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1824  __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1825  __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1826  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1827  (__vector long long)__t1); \
1828  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1829  (__vector long long)__t1); \
1830  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1831  (__vector long long)__t3); \
1832  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1833  (__vector long long)__t3); \
1834 } while (0)
1835 
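/* Illustrative usage sketch (editorial addition; the helper name is
   hypothetical): the macro transposes the 4x4 matrix held in four row
   vectors in place, so afterwards row0 should hold the original column 0,
   row1 column 1, and so on.  */
static __inline void
example_transpose (void)
{
  __m128 row0 = _mm_setr_ps ( 1.0f,  2.0f,  3.0f,  4.0f);
  __m128 row1 = _mm_setr_ps ( 5.0f,  6.0f,  7.0f,  8.0f);
  __m128 row2 = _mm_setr_ps ( 9.0f, 10.0f, 11.0f, 12.0f);
  __m128 row3 = _mm_setr_ps (13.0f, 14.0f, 15.0f, 16.0f);
  _MM_TRANSPOSE4_PS (row0, row1, row2, row3);
  /* row0 == { 1, 5, 9, 13 }, row1 == { 2, 6, 10, 14 }, etc.  */
  (void) row0; (void) row1; (void) row2; (void) row3;
}
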
1836 /* For backward source compatibility. */
1837 //# include <emmintrin.h>
1838 
1839 #else
1840 #include_next <xmmintrin.h>
1841 #endif /* defined(__linux__) && defined(__ppc64__) */
1842 
1843 #endif /* _XMMINTRIN_H_INCLUDED */