1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is intended to help port code that explicitly uses
15  Intel intrinsics from x86_64 to powerpc64/powerpc64le.
16 
17  Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18  VMX/VSX ISA is a good match for vector float SIMD operations.
19  However, scalar float operations in vector (XMM) registers require
20  the POWER8 VSX ISA (2.07) level. There are differences in data
21  format and placement of float scalars in the vector register, which
22  require extra steps to match SSE scalar float semantics on POWER.
23 
24  It should be noted that there are significant differences between the
25  X86_64 MXCSR and the PowerISA FPSCR/VSCR registers. It is recommended
26  to use the portable <fenv.h> interface instead of accessing the MXCSR directly.
27 
28  Most SSE scalar float intrinsic operations can be performed more
29  efficiently as C language float scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications. */
31 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
32 #endif
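/* Editorial note (not part of the original header): a minimal sketch of the
   portable alternatives recommended above, assuming hypothetical scalar
   floats a and b.

     #include <fenv.h>
     fesetround (FE_TOWARDZERO);   // portable control of the rounding mode
     float sum = a + b;            // instead of _mm_add_ss + _mm_cvtss_f32

   Both forms avoid any dependence on the x86 MXCSR.  */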
33 
34 #ifndef _XMMINTRIN_H_INCLUDED
35 #define _XMMINTRIN_H_INCLUDED
36 
37 /* Define four value permute mask */
38 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
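/* Editorial example (not part of the original header): _MM_SHUFFLE (3, 2, 1, 0)
   expands to (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4.  Passed to
   _mm_shuffle_ps (a, b, ...) below, it selects a[0], a[1], b[2], b[3].  */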
39 
40 #include <altivec.h>
41 
42 /* Avoid collisions between altivec.h and strict adherence to C++ and
43  C11 standards. This should eventually be done inside altivec.h itself,
44  but only after testing a full distro build. */
45 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
46  (defined(__STDC_VERSION__) && \
47  __STDC_VERSION__ >= 201112L))
48 #undef vector
49 #undef pixel
50 #undef bool
51 #endif
52 
53 /* We need type definitions from the MMX header file. */
54 #include <mmintrin.h>
55 
56 /* Get _mm_malloc () and _mm_free (). */
57 #if __STDC_HOSTED__
58 #include <mm_malloc.h>
59 #endif
60 
61 /* The Intel API is flexible enough that we must allow aliasing with other
62  vector types, and their scalar components. */
63 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
64 
65 /* Unaligned version of the same type. */
66 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
67  __aligned__ (1)));
68 
69 /* Internal data types for implementing the intrinsics. */
70 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
71 
72 /* Create an undefined vector. */
73 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
74 _mm_undefined_ps (void)
75 {
76  __m128 __Y = __Y;
77  return __Y;
78 }
79 
80 /* Create a vector of zeros. */
81 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
82 _mm_setzero_ps (void)
83 {
84  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
85 }
86 
87 /* Load four SPFP values from P. The address must be 16-byte aligned. */
88 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89 _mm_load_ps (float const *__P)
90 {
91  return ((__m128)vec_ld(0, (__v4sf*)__P));
92 }
93 
94 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
95 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96 _mm_loadu_ps (float const *__P)
97 {
98  return (vec_vsx_ld(0, __P));
99 }
100 
101 /* Load four SPFP values in reverse order. The address must be aligned. */
102 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103 _mm_loadr_ps (float const *__P)
104 {
105  __v4sf __tmp;
106  __m128 result;
107  static const __vector unsigned char permute_vector =
108  { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
109  0x17, 0x10, 0x11, 0x12, 0x13 };
110 
111  __tmp = vec_ld (0, (__v4sf *) __P);
112  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
113  return result;
114 }
115 
116 /* Create a vector with all four elements equal to F. */
117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
118 _mm_set1_ps (float __F)
119 {
120  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
121 }
122 
123 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124 _mm_set_ps1 (float __F)
125 {
126  return _mm_set1_ps (__F);
127 }
128 
129 /* Create the vector [Z Y X W]. */
130 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
131 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
132 {
133  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
134 }
135 
136 /* Create the vector [W X Y Z]. */
137 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
138 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
139 {
140  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
141 }
142 
143 /* Store four SPFP values. The address must be 16-byte aligned. */
144 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
145 _mm_store_ps (float *__P, __m128 __A)
146 {
147  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
148 }
149 
150 /* Store four SPFP values. The address need not be 16-byte aligned. */
151 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
152 _mm_storeu_ps (float *__P, __m128 __A)
153 {
154  *(__m128_u *)__P = __A;
155 }
156 
157 /* Store four SPFP values in reverse order. The address must be aligned. */
158 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
159 _mm_storer_ps (float *__P, __m128 __A)
160 {
161  __v4sf __tmp;
162  static const __vector unsigned char permute_vector =
163  { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
164  0x17, 0x10, 0x11, 0x12, 0x13 };
165 
166  __tmp = (__m128) vec_perm (__A, __A, permute_vector);
167 
168  _mm_store_ps (__P, __tmp);
169 }
170 
171 /* Store the lower SPFP value across four words. */
172 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
173 _mm_store1_ps (float *__P, __m128 __A)
174 {
175  __v4sf __va = vec_splat((__v4sf)__A, 0);
176  _mm_store_ps (__P, __va);
177 }
178 
179 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
180 _mm_store_ps1 (float *__P, __m128 __A)
181 {
182  _mm_store1_ps (__P, __A);
183 }
184 
185 /* Create a vector with element 0 as F and the rest zero. */
186 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
187 _mm_set_ss (float __F)
188 {
189  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
190 }
191 
192 /* Sets the low SPFP value of A from the low value of B. */
193 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_move_ss (__m128 __A, __m128 __B)
195 {
196  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
197 
198  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
199 }
200 
201 /* Create a vector with element 0 as *P and the rest zero. */
202 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
203 _mm_load_ss (float const *__P)
204 {
205  return _mm_set_ss (*__P);
206 }
207 
208 /* Stores the lower SPFP value. */
209 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210 _mm_store_ss (float *__P, __m128 __A)
211 {
212  *__P = ((__v4sf)__A)[0];
213 }
214 
215 /* Perform the respective operation on the lower SPFP (single-precision
216  floating-point) values of A and B; the upper three SPFP values are
217  passed through from A. */
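/* Editorial example (not part of the original header): given

     __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);     // a = {1, 2, 3, 4}
     __m128 b = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);  // b = {10, 20, 30, 40}

   _mm_add_ss (a, b) returns {11, 2, 3, 4}: only element 0 is added,
   elements 1-3 come unchanged from a.  */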
218 
219 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
220 _mm_add_ss (__m128 __A, __m128 __B)
221 {
222 #ifdef _ARCH_PWR7
223  __m128 a, b, c;
224  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
225  /* PowerISA VSX does not allow partial (for just the lower float)
226  results. So to ensure we don't generate spurious exceptions
227  (from the upper float values) we splat the lower float
228  before we do the operation. */
229  a = vec_splat (__A, 0);
230  b = vec_splat (__B, 0);
231  c = a + b;
232  /* Then we merge the lower float result with the original upper
233  float elements from __A. */
234  return (vec_sel (__A, c, mask));
235 #else
236  __A[0] = __A[0] + __B[0];
237  return (__A);
238 #endif
239 }
240 
241 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
242 _mm_sub_ss (__m128 __A, __m128 __B)
243 {
244 #ifdef _ARCH_PWR7
245  __m128 a, b, c;
246  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
247  /* PowerISA VSX does not allow partial (for just the lower float)
248  results. So to ensure we don't generate spurious exceptions
249  (from the upper float values) we splat the lower float
250  before we do the operation. */
251  a = vec_splat (__A, 0);
252  b = vec_splat (__B, 0);
253  c = a - b;
254  /* Then we merge the lower float result with the original upper
255  float elements from __A. */
256  return (vec_sel (__A, c, mask));
257 #else
258  __A[0] = __A[0] - __B[0];
259  return (__A);
260 #endif
261 }
262 
263 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
264 _mm_mul_ss (__m128 __A, __m128 __B)
265 {
266 #ifdef _ARCH_PWR7
267  __m128 a, b, c;
268  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
269  /* PowerISA VSX does not allow partial (for just the lower float)
270  results. So to ensure we don't generate spurious exceptions
271  (from the upper float values) we splat the lower float
272  before we do the operation. */
273  a = vec_splat (__A, 0);
274  b = vec_splat (__B, 0);
275  c = a * b;
276  /* Then we merge the lower float result with the original upper
277  float elements from __A. */
278  return (vec_sel (__A, c, mask));
279 #else
280  __A[0] = __A[0] * __B[0];
281  return (__A);
282 #endif
283 }
284 
285 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
286 _mm_div_ss (__m128 __A, __m128 __B)
287 {
288 #ifdef _ARCH_PWR7
289  __m128 a, b, c;
290  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
291  /* PowerISA VSX does not allow partial (for just the lower float)
292  results. So to ensure we don't generate spurious exceptions
293  (from the upper float values) we splat the lower float
294  before we do the operation. */
295  a = vec_splat (__A, 0);
296  b = vec_splat (__B, 0);
297  c = a / b;
298  /* Then we merge the lower float result with the original upper
299  float elements from __A. */
300  return (vec_sel (__A, c, mask));
301 #else
302  __A[0] = __A[0] / __B[0];
303  return (__A);
304 #endif
305 }
306 
307 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
308 _mm_sqrt_ss (__m128 __A)
309 {
310  __m128 a, c;
311  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
312  /* PowerISA VSX does not allow partial (for just the lower float)
313  * results. So to ensure we don't generate spurious exceptions
314  * (from the upper float values) we splat the lower float
315  * before we do the operation. */
316  a = vec_splat (__A, 0);
317  c = vec_sqrt (a);
318  /* Then we merge the lower float result with the original upper
319  * float elements from __A. */
320  return (vec_sel (__A, c, mask));
321 }
322 
323 /* Perform the respective operation on the four SPFP values in A and B. */
324 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
325 _mm_add_ps (__m128 __A, __m128 __B)
326 {
327  return (__m128) ((__v4sf)__A + (__v4sf)__B);
328 }
329 
330 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
331 _mm_sub_ps (__m128 __A, __m128 __B)
332 {
333  return (__m128) ((__v4sf)__A - (__v4sf)__B);
334 }
335 
336 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
337 _mm_mul_ps (__m128 __A, __m128 __B)
338 {
339  return (__m128) ((__v4sf)__A * (__v4sf)__B);
340 }
341 
342 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_div_ps (__m128 __A, __m128 __B)
344 {
345  return (__m128) ((__v4sf)__A / (__v4sf)__B);
346 }
347 
348 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_sqrt_ps (__m128 __A)
350 {
351  return (vec_sqrt ((__v4sf)__A));
352 }
353 
354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_rcp_ps (__m128 __A)
356 {
357  return (vec_re ((__v4sf)__A));
358 }
359 
360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_rsqrt_ps (__m128 __A)
362 {
363  return (vec_rsqrte (__A));
364 }
365 
366 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_rcp_ss (__m128 __A)
368 {
369  __m128 a, c;
370  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
371  /* PowerISA VSX does not allow partial (for just the lower float)
372  * results. So to ensure we don't generate spurious exceptions
373  * (from the upper float values) we splat the lower float
374  * before we do the operation. */
375  a = vec_splat (__A, 0);
376  c = _mm_rcp_ps (a);
377  /* Then we merge the lower float result with the original upper
378  * float elements from __A. */
379  return (vec_sel (__A, c, mask));
380 }
381 
382 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
383 _mm_rsqrt_ss (__m128 __A)
384 {
385  __m128 a, c;
386  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
387  /* PowerISA VSX does not allow partial (for just the lower float)
388  * results. So to ensure we don't generate spurious exceptions
389  * (from the upper float values) we splat the lower float
390  * before we do the operation. */
391  a = vec_splat (__A, 0);
392  c = vec_rsqrte (a);
393  /* Then we merge the lower float result with the original upper
394  * float elements from __A. */
395  return (vec_sel (__A, c, mask));
396 }
397 
398 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
399 _mm_min_ss (__m128 __A, __m128 __B)
400 {
401  __v4sf a, b, c;
402  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
403  /* PowerISA VSX does not allow partial (for just lower float)
404  * results. So to ensure we don't generate spurious exceptions
405  * (from the upper float values) we splat the lower float
406  * before we do the operation. */
407  a = vec_splat ((__v4sf)__A, 0);
408  b = vec_splat ((__v4sf)__B, 0);
409  c = vec_min (a, b);
410  /* Then we merge the lower float result with the original upper
411  * float elements from __A. */
412  return (vec_sel ((__v4sf)__A, c, mask));
413 }
414 
415 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
416 _mm_max_ss (__m128 __A, __m128 __B)
417 {
418  __v4sf a, b, c;
419  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
420  /* PowerISA VSX does not allow partial (for just lower float)
421  * results. So to ensure we don't generate spurious exceptions
422  * (from the upper float values) we splat the lower float
423  * before we do the operation. */
424  a = vec_splat (__A, 0);
425  b = vec_splat (__B, 0);
426  c = vec_max (a, b);
427  /* Then we merge the lower float result with the original upper
428  * float elements from __A. */
429  return (vec_sel ((__v4sf)__A, c, mask));
430 }
431 
432 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
433 _mm_min_ps (__m128 __A, __m128 __B)
434 {
435  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
436  return vec_sel (__B, __A, m);
437 }
438 
439 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
440 _mm_max_ps (__m128 __A, __m128 __B)
441 {
442  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
443  return vec_sel (__B, __A, m);
444 }
445 
446 /* Perform logical bit-wise operations on 128-bit values. */
447 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
448 _mm_and_ps (__m128 __A, __m128 __B)
449 {
450  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
451 // return __builtin_ia32_andps (__A, __B);
452 }
453 
454 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
455 _mm_andnot_ps (__m128 __A, __m128 __B)
456 {
457  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
458 }
459 
460 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
461 _mm_or_ps (__m128 __A, __m128 __B)
462 {
463  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
464 }
465 
466 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
467 _mm_xor_ps (__m128 __A, __m128 __B)
468 {
469  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
470 }
471 
472 /* Perform a comparison on the four SPFP values of A and B. For each
473  element, if the comparison is true, place a mask of all ones in the
474  result, otherwise a mask of zeros. */
475 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
476 _mm_cmpeq_ps (__m128 __A, __m128 __B)
477 {
478  return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
479 }
480 
481 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482 _mm_cmplt_ps (__m128 __A, __m128 __B)
483 {
484  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
485 }
486 
487 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488 _mm_cmple_ps (__m128 __A, __m128 __B)
489 {
490  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
491 }
492 
493 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
494 _mm_cmpgt_ps (__m128 __A, __m128 __B)
495 {
496  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
497 }
498 
499 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
500 _mm_cmpge_ps (__m128 __A, __m128 __B)
501 {
502  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
503 }
504 
505 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
506 _mm_cmpneq_ps (__m128 __A, __m128 __B)
507 {
508  __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
509  return ((__m128)vec_nor (temp, temp));
510 }
511 
512 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
514 {
515  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
516 }
517 
518 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cmpnle_ps (__m128 __A, __m128 __B)
520 {
521  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
522 }
523 
524 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525 _mm_cmpngt_ps (__m128 __A, __m128 __B)
526 {
527  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
528 }
529 
530 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
531 _mm_cmpnge_ps (__m128 __A, __m128 __B)
532 {
533  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
534 }
535 
536 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
537 _mm_cmpord_ps (__m128 __A, __m128 __B)
538 {
539  __vector unsigned int a, b;
540  __vector unsigned int c, d;
541  static const __vector unsigned int float_exp_mask =
542  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
543 
544  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
545  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
546  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
547  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
548  return ((__m128 ) vec_and (c, d));
549 }
550 
551 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
552 _mm_cmpunord_ps (__m128 __A, __m128 __B)
553 {
554  __vector unsigned int a, b;
555  __vector unsigned int c, d;
556  static const __vector unsigned int float_exp_mask =
557  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
558 
559  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
560  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
561  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
562  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
563  return ((__m128 ) vec_or (c, d));
564 }
565 
566 /* Perform a comparison on the lower SPFP values of A and B. If the
567  comparison is true, place a mask of all ones in the result, otherwise a
568  mask of zeros. The upper three SPFP values are passed through from A. */
569 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
570 _mm_cmpeq_ss (__m128 __A, __m128 __B)
571 {
572  static const __vector unsigned int mask =
573  { 0xffffffff, 0, 0, 0 };
574  __v4sf a, b, c;
575  /* PowerISA VMX does not allow partial (for just element 0)
576  * results. So to ensure we don't generate spurious exceptions
577  * (from the upper elements) we splat the lower float
578  * before we do the operation. */
579  a = vec_splat ((__v4sf) __A, 0);
580  b = vec_splat ((__v4sf) __B, 0);
581  c = (__v4sf) vec_cmpeq(a, b);
582  /* Then we merge the lower float result with the original upper
583  * float elements from __A. */
584  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
585 }
586 
587 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
588 _mm_cmplt_ss (__m128 __A, __m128 __B)
589 {
590  static const __vector unsigned int mask =
591  { 0xffffffff, 0, 0, 0 };
592  __v4sf a, b, c;
593  /* PowerISA VMX does not allow partial (for just element 0)
594  * results. So to ensure we don't generate spurious exceptions
595  * (from the upper elements) we splat the lower float
596  * before we do the operation. */
597  a = vec_splat ((__v4sf) __A, 0);
598  b = vec_splat ((__v4sf) __B, 0);
599  c = (__v4sf) vec_cmplt(a, b);
600  /* Then we merge the lower float result with the original upper
601  * float elements from __A. */
602  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
603 }
604 
605 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606 _mm_cmple_ss (__m128 __A, __m128 __B)
607 {
608  static const __vector unsigned int mask =
609  { 0xffffffff, 0, 0, 0 };
610  __v4sf a, b, c;
611  /* PowerISA VMX does not allow partial (for just element 0)
612  * results. So to ensure we don't generate spurious exceptions
613  * (from the upper elements) we splat the lower float
614  * before we do the operation. */
615  a = vec_splat ((__v4sf) __A, 0);
616  b = vec_splat ((__v4sf) __B, 0);
617  c = (__v4sf) vec_cmple(a, b);
618  /* Then we merge the lower float result with the original upper
619  * float elements from __A. */
620  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
621 }
622 
623 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624 _mm_cmpgt_ss (__m128 __A, __m128 __B)
625 {
626  static const __vector unsigned int mask =
627  { 0xffffffff, 0, 0, 0 };
628  __v4sf a, b, c;
629  /* PowerISA VMX does not allow partial (for just element 0)
630  * results. So to ensure we don't generate spurious exceptions
631  * (from the upper elements) we splat the lower float
632  * before we do the operation. */
633  a = vec_splat ((__v4sf) __A, 0);
634  b = vec_splat ((__v4sf) __B, 0);
635  c = (__v4sf) vec_cmpgt(a, b);
636  /* Then we merge the lower float result with the original upper
637  * float elements from __A. */
638  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
639 }
640 
641 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642 _mm_cmpge_ss (__m128 __A, __m128 __B)
643 {
644  static const __vector unsigned int mask =
645  { 0xffffffff, 0, 0, 0 };
646  __v4sf a, b, c;
647  /* PowerISA VMX does not allow partial (for just element 0)
648  * results. So to ensure we don't generate spurious exceptions
649  * (from the upper elements) we splat the lower float
650  * before we do the operation. */
651  a = vec_splat ((__v4sf) __A, 0);
652  b = vec_splat ((__v4sf) __B, 0);
653  c = (__v4sf) vec_cmpge(a, b);
654  /* Then we merge the lower float result with the original upper
655  * float elements from __A. */
656  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
657 }
658 
659 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_cmpneq_ss (__m128 __A, __m128 __B)
661 {
662  static const __vector unsigned int mask =
663  { 0xffffffff, 0, 0, 0 };
664  __v4sf a, b, c;
665  /* PowerISA VMX does not allow partial (for just element 0)
666  * results. So to ensure we don't generate spurious exceptions
667  * (from the upper elements) we splat the lower float
668  * before we do the operation. */
669  a = vec_splat ((__v4sf) __A, 0);
670  b = vec_splat ((__v4sf) __B, 0);
671  c = (__v4sf) vec_cmpeq(a, b);
672  c = vec_nor (c, c);
673  /* Then we merge the lower float result with the original upper
674  * float elements from __A. */
675  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
676 }
677 
678 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
679 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
680 {
681  static const __vector unsigned int mask =
682  { 0xffffffff, 0, 0, 0 };
683  __v4sf a, b, c;
684  /* PowerISA VMX does not allow partial (for just element 0)
685  * results. So to ensure we don't generate spurious exceptions
686  * (from the upper elements) we splat the lower float
687  * before we do the operation. */
688  a = vec_splat ((__v4sf) __A, 0);
689  b = vec_splat ((__v4sf) __B, 0);
690  c = (__v4sf) vec_cmpge(a, b);
691  /* Then we merge the lower float result with the original upper
692  * float elements from __A. */
693  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
694 }
695 
696 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697 _mm_cmpnle_ss (__m128 __A, __m128 __B)
698 {
699  static const __vector unsigned int mask =
700  { 0xffffffff, 0, 0, 0 };
701  __v4sf a, b, c;
702  /* PowerISA VMX does not allow partial (for just element 0)
703  * results. So to ensure we don't generate spurious exceptions
704  * (from the upper elements) we splat the lower float
705  * before we do the operation. */
706  a = vec_splat ((__v4sf) __A, 0);
707  b = vec_splat ((__v4sf) __B, 0);
708  c = (__v4sf) vec_cmpgt(a, b);
709  /* Then we merge the lower float result with the original upper
710  * float elements from __A. */
711  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
712 }
713 
714 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715 _mm_cmpngt_ss (__m128 __A, __m128 __B)
716 {
717  static const __vector unsigned int mask =
718  { 0xffffffff, 0, 0, 0 };
719  __v4sf a, b, c;
720  /* PowerISA VMX does not allow partial (for just element 0)
721  * results. So to ensure we don't generate spurious exceptions
722  * (from the upper elements) we splat the lower float
723  * before we do the operation. */
724  a = vec_splat ((__v4sf) __A, 0);
725  b = vec_splat ((__v4sf) __B, 0);
726  c = (__v4sf) vec_cmple(a, b);
727  /* Then we merge the lower float result with the original upper
728  * float elements from __A. */
729  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
730 }
731 
732 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
733 _mm_cmpnge_ss (__m128 __A, __m128 __B)
734 {
735  static const __vector unsigned int mask =
736  { 0xffffffff, 0, 0, 0 };
737  __v4sf a, b, c;
738  /* PowerISA VMX does not allow partial (for just element 0)
739  * results. So to ensure we don't generate spurious exceptions
740  * (from the upper elements) we splat the lower float
741  * before we do the operation. */
742  a = vec_splat ((__v4sf) __A, 0);
743  b = vec_splat ((__v4sf) __B, 0);
744  c = (__v4sf) vec_cmplt(a, b);
745  /* Then we merge the lower float result with the original upper
746  * float elements from __A. */
747  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
748 }
749 
750 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
751 _mm_cmpord_ss (__m128 __A, __m128 __B)
752 {
753  __vector unsigned int a, b;
754  __vector unsigned int c, d;
755  static const __vector unsigned int float_exp_mask =
756  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
757  static const __vector unsigned int mask =
758  { 0xffffffff, 0, 0, 0 };
759 
760  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
761  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
762  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
763  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
764  c = vec_and (c, d);
765  /* Then we merge the lower float result with the original upper
766  * float elements from __A. */
767  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
768 }
769 
770 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
771 _mm_cmpunord_ss (__m128 __A, __m128 __B)
772 {
773  __vector unsigned int a, b;
774  __vector unsigned int c, d;
775  static const __vector unsigned int float_exp_mask =
776  { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
777  static const __vector unsigned int mask =
778  { 0xffffffff, 0, 0, 0 };
779 
780  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
781  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
782  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
783  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
784  c = vec_or (c, d);
785  /* Then we merge the lower float result with the original upper
786  * float elements from __A. */
787  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
788 }
789 
790 /* Compare the lower SPFP values of A and B and return 1 if true
791  and 0 if false. */
792 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
793 _mm_comieq_ss (__m128 __A, __m128 __B)
794 {
795  return (__A[0] == __B[0]);
796 }
797 
798 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
799 _mm_comilt_ss (__m128 __A, __m128 __B)
800 {
801  return (__A[0] < __B[0]);
802 }
803 
804 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
805 _mm_comile_ss (__m128 __A, __m128 __B)
806 {
807  return (__A[0] <= __B[0]);
808 }
809 
810 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811 _mm_comigt_ss (__m128 __A, __m128 __B)
812 {
813  return (__A[0] > __B[0]);
814 }
815 
816 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817 _mm_comige_ss (__m128 __A, __m128 __B)
818 {
819  return (__A[0] >= __B[0]);
820 }
821 
822 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
823 _mm_comineq_ss (__m128 __A, __m128 __B)
824 {
825  return (__A[0] != __B[0]);
826 }
827 
828 /* FIXME
829  * The _mm_ucomi??_ss implementations below are exactly the same as
830  * _mm_comi??_ss because GCC for PowerPC only generates unordered
831  * compares (scalar and vector).
832  * Technically _mm_comieq_ss et al. should be using the ordered
833  * compare and signal for QNaNs.
834  * The _mm_ucomieq_ss et al. should be OK, as is.
835  */
836 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837 _mm_ucomieq_ss (__m128 __A, __m128 __B)
838 {
839  return (__A[0] == __B[0]);
840 }
841 
842 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843 _mm_ucomilt_ss (__m128 __A, __m128 __B)
844 {
845  return (__A[0] < __B[0]);
846 }
847 
848 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
849 _mm_ucomile_ss (__m128 __A, __m128 __B)
850 {
851  return (__A[0] <= __B[0]);
852 }
853 
854 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
855 _mm_ucomigt_ss (__m128 __A, __m128 __B)
856 {
857  return (__A[0] > __B[0]);
858 }
859 
860 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
861 _mm_ucomige_ss (__m128 __A, __m128 __B)
862 {
863  return (__A[0] >= __B[0]);
864 }
865 
866 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867 _mm_ucomineq_ss (__m128 __A, __m128 __B)
868 {
869  return (__A[0] != __B[0]);
870 }
871 
872 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873 _mm_cvtss_f32 (__m128 __A)
874 {
875  return ((__v4sf)__A)[0];
876 }
877 
878 /* Convert the lower SPFP value to a 32-bit integer according to the current
879  rounding mode. */
880 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
881 _mm_cvtss_si32 (__m128 __A)
882 {
883  __m64 res = 0;
884 #ifdef _ARCH_PWR8
885  double dtmp;
886  __asm__(
887 #ifdef __LITTLE_ENDIAN__
888  "xxsldwi %x0,%x0,%x0,3;\n"
889 #endif
890  "xscvspdp %x2,%x0;\n"
891  "fctiw %2,%2;\n"
892  "mfvsrd %1,%x2;\n"
893  : "+wa" (__A),
894  "=r" (res),
895  "=f" (dtmp)
896  : );
897 #else
898  res = __builtin_rint(__A[0]);
899 #endif
900  return (res);
901 }
902 
903 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904 _mm_cvt_ss2si (__m128 __A)
905 {
906  return _mm_cvtss_si32 (__A);
907 }
908 
909 /* Convert the lower SPFP value to a 64-bit integer according to the
910  current rounding mode. */
911 
912 /* Intel intrinsic. */
913 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
914 _mm_cvtss_si64 (__m128 __A)
915 {
916  __m64 res = 0;
917 #ifdef _ARCH_PWR8
918  double dtmp;
919  __asm__(
920 #ifdef __LITTLE_ENDIAN__
921  "xxsldwi %x0,%x0,%x0,3;\n"
922 #endif
923  "xscvspdp %x2,%x0;\n"
924  "fctid %2,%2;\n"
925  "mfvsrd %1,%x2;\n"
926  : "+wa" (__A),
927  "=r" (res),
928  "=f" (dtmp)
929  : );
930 #else
931  res = __builtin_llrint(__A[0]);
932 #endif
933  return (res);
934 }
935 
936 /* Microsoft intrinsic. */
937 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
938 _mm_cvtss_si64x (__m128 __A)
939 {
940  return _mm_cvtss_si64 ((__v4sf) __A);
941 }
942 
943 /* Constants for use with _mm_prefetch. */
944 enum _mm_hint
945 {
946  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
947  _MM_HINT_ET0 = 7,
948  _MM_HINT_ET1 = 6,
949  _MM_HINT_T0 = 3,
950  _MM_HINT_T1 = 2,
951  _MM_HINT_T2 = 1,
952  _MM_HINT_NTA = 0
953 };
954 
955 /* Loads one cache line from address P to a location "closer" to the
956  processor. The selector I specifies the type of prefetch operation. */
957 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_prefetch (const void *__P, enum _mm_hint __I)
959 {
960  /* Current PowerPC implementations ignore the hint parameter. */
961  __builtin_prefetch (__P);
962 }
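/* Editorial example (not part of the original header): a typical call,
   with buf being a hypothetical pointer to data about to be read.

     _mm_prefetch (buf, _MM_HINT_T0);

   On PowerPC this lowers to a plain __builtin_prefetch of the address;
   the hint argument is accepted only for source compatibility.  */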
963 
964 /* Convert the two lower SPFP values to 32-bit integers according to the
965  current rounding mode. Return the integers in packed form. */
966 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
967 _mm_cvtps_pi32 (__m128 __A)
968 {
970  __v4sf temp, rounded;
971  __vector unsigned long long result;
972 
973  /* Splat two lower SPFP values to both halves. */
974  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
975  rounded = vec_rint(temp);
976  result = (__vector unsigned long long) vec_cts (rounded, 0);
977 
978  return (__m64) ((__vector long long) result)[0];
979 }
980 
981 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
982 _mm_cvt_ps2pi (__m128 __A)
983 {
984  return _mm_cvtps_pi32 (__A);
985 }
986 
987 /* Truncate the lower SPFP value to a 32-bit integer. */
988 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
989 _mm_cvttss_si32 (__m128 __A)
990 {
991  /* Extract the lower float element. */
992  float temp = __A[0];
993  /* truncate to 32-bit integer and return. */
994  return temp;
995 }
996 
997 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998 _mm_cvtt_ss2si (__m128 __A)
999 {
1000  return _mm_cvttss_si32 (__A);
1001 }
1002 
1003 /* Intel intrinsic. */
1004 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005 _mm_cvttss_si64 (__m128 __A)
1006 {
1007  /* Extract the lower float element. */
1008  float temp = __A[0];
1009  /* truncate to 64-bit integer and return. */
1010  return temp;
1011 }
1012 
1013 /* Microsoft intrinsic. */
1014 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_cvttss_si64x (__m128 __A)
1016 {
1017  /* Extract the lower float element. */
1018  float temp = __A[0];
1019  /* truncate to 64-bit integer and return. */
1020  return temp;
1021 }
1022 
1023 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1024  integers in packed form. */
1025 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvttps_pi32 (__m128 __A)
1027 {
1028  __v4sf temp;
1029  __vector unsigned long long result;
1030 
1031  /* Splat two lower SPFP values to both halves. */
1032  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1033  result = (__vector unsigned long long) vec_cts (temp, 0);
1034 
1035  return (__m64) ((__vector long long) result)[0];
1036 }
1037 
1038 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1039 _mm_cvtt_ps2pi (__m128 __A)
1040 {
1041  return _mm_cvttps_pi32 (__A);
1042 }
1043 
1044 /* Convert B to a SPFP value and insert it as element zero in A. */
1045 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046 _mm_cvtsi32_ss (__m128 __A, int __B)
1047 {
1048  float temp = __B;
1049  __A[0] = temp;
1050 
1051  return __A;
1052 }
1053 
1054 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1055 _mm_cvt_si2ss (__m128 __A, int __B)
1056 {
1057  return _mm_cvtsi32_ss (__A, __B);
1058 }
1059 
1060 /* Convert B to a SPFP value and insert it as element zero in A. */
1061 /* Intel intrinsic. */
1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1063 _mm_cvtsi64_ss (__m128 __A, long long __B)
1064 {
1065  float temp = __B;
1066  __A[0] = temp;
1067 
1068  return __A;
1069 }
1070 
1071 /* Microsoft intrinsic. */
1072 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1073 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1074 {
1075  return _mm_cvtsi64_ss (__A, __B);
1076 }
1077 
1078 /* Convert the two 32-bit values in B to SPFP form and insert them
1079  as the two lower elements in A. */
1080 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1081 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1082 {
1083  __vector signed int vm1;
1084  __vector float vf1;
1085 
1086  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1087  vf1 = (__vector float) vec_ctf (vm1, 0);
1088 
1089  return ((__m128) (__vector unsigned long long)
1090  { ((__vector unsigned long long)vf1) [0],
1091  ((__vector unsigned long long)__A) [1]});
1092 }
1093 
1094 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1095 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1096 {
1097  return _mm_cvtpi32_ps (__A, __B);
1098 }
1099 
1100 /* Convert the four signed 16-bit values in A to SPFP form. */
1101 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102 _mm_cvtpi16_ps (__m64 __A)
1103 {
1104  __vector signed short vs8;
1105  __vector signed int vi4;
1106  __vector float vf1;
1107 
1108  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1109  vi4 = vec_vupklsh (vs8);
1110  vf1 = (__vector float) vec_ctf (vi4, 0);
1111 
1112  return (__m128) vf1;
1113 }
1114 
1115 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1116 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1117 _mm_cvtpu16_ps (__m64 __A)
1118 {
1119  const __vector unsigned short zero =
1120  { 0, 0, 0, 0, 0, 0, 0, 0 };
1121  __vector unsigned short vs8;
1122  __vector unsigned int vi4;
1123  __vector float vf1;
1124 
1125  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1126  vi4 = (__vector unsigned int) vec_mergel
1127 #ifdef __LITTLE_ENDIAN__
1128  (vs8, zero);
1129 #else
1130  (zero, vs8);
1131 #endif
1132  vf1 = (__vector float) vec_ctf (vi4, 0);
1133 
1134  return (__m128) vf1;
1135 }
1136 
1137 /* Convert the low four signed 8-bit values in A to SPFP form. */
1138 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1139 _mm_cvtpi8_ps (__m64 __A)
1140 {
1141  __vector signed char vc16;
1142  __vector signed short vs8;
1143  __vector signed int vi4;
1144  __vector float vf1;
1145 
1146  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1147  vs8 = vec_vupkhsb (vc16);
1148  vi4 = vec_vupkhsh (vs8);
1149  vf1 = (__vector float) vec_ctf (vi4, 0);
1150 
1151  return (__m128) vf1;
1152 }
1153 
1154 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1155 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156 
1157 _mm_cvtpu8_ps (__m64 __A)
1158 {
1159  const __vector unsigned char zero =
1160  { 0, 0, 0, 0, 0, 0, 0, 0 };
1161  __vector unsigned char vc16;
1162  __vector unsigned short vs8;
1163  __vector unsigned int vi4;
1164  __vector float vf1;
1165 
1166  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1167 #ifdef __LITTLE_ENDIAN__
1168  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1169  vi4 = (__vector unsigned int) vec_mergeh (vs8,
1170  (__vector unsigned short) zero);
1171 #else
1172  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1173  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1174  vs8);
1175 #endif
1176  vf1 = (__vector float) vec_ctf (vi4, 0);
1177 
1178  return (__m128) vf1;
1179 }
1180 
1181 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1182 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1183 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1184 {
1185  __vector signed int vi4;
1186  __vector float vf4;
1187 
1188  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1189  vf4 = (__vector float) vec_ctf (vi4, 0);
1190  return (__m128) vf4;
1191 }
1192 
1193 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1194 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195 _mm_cvtps_pi16 (__m128 __A)
1196 {
1197  __v4sf rounded;
1198  __vector signed int temp;
1199  __vector unsigned long long result;
1200 
1201  rounded = vec_rint(__A);
1202  temp = vec_cts (rounded, 0);
1203  result = (__vector unsigned long long) vec_pack (temp, temp);
1204 
1205  return (__m64) ((__vector long long) result)[0];
1206 }
1207 
1208 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1209 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1210 _mm_cvtps_pi8 (__m128 __A)
1211 {
1212  __v4sf rounded;
1213  __vector signed int tmp_i;
1214  static const __vector signed int zero = {0, 0, 0, 0};
1215  __vector signed short tmp_s;
1216  __vector signed char res_v;
1217 
1218  rounded = vec_rint(__A);
1219  tmp_i = vec_cts (rounded, 0);
1220  tmp_s = vec_pack (tmp_i, zero);
1221  res_v = vec_pack (tmp_s, tmp_s);
1222  return (__m64) ((__vector long long) res_v)[0];
1223 }
1224 
1225 /* Selects four specific SPFP values from A and B based on MASK. */
1226 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227 
1228 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1229 {
1230  unsigned long element_selector_10 = __mask & 0x03;
1231  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1232  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1233  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1234  static const unsigned int permute_selectors[4] =
1235  {
1236 #ifdef __LITTLE_ENDIAN__
1237  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1238 #else
1239  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1240 #endif
1241  };
1242  __vector unsigned int t;
1243 
1244  t[0] = permute_selectors[element_selector_10];
1245  t[1] = permute_selectors[element_selector_32];
1246  t[2] = permute_selectors[element_selector_54] + 0x10101010;
1247  t[3] = permute_selectors[element_selector_76] + 0x10101010;
1248  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1249 }
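/* Editorial example (not part of the original header):
     _mm_shuffle_ps (a, a, _MM_SHUFFLE (0, 1, 2, 3))
   reverses the element order of a hypothetical vector a, while
   _MM_SHUFFLE (3, 2, 1, 0) leaves the order unchanged.  */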
1250 
1251 /* Selects and interleaves the upper two SPFP values from A and B. */
1252 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1254 {
1255  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1256 }
1257 
1258 /* Selects and interleaves the lower two SPFP values from A and B. */
1259 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1260 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1261 {
1262  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1263 }
1264 
1265 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1266  the lower two values are passed through from A. */
1267 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1269 {
1270  __vector unsigned long long __a = (__vector unsigned long long)__A;
1271  __vector unsigned long long __p = vec_splats(*__P);
1272  __a [1] = __p [1];
1273 
1274  return (__m128)__a;
1275 }
1276 
1277 /* Stores the upper two SPFP values of A into P. */
1278 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1279 _mm_storeh_pi (__m64 *__P, __m128 __A)
1280 {
1281  __vector unsigned long long __a = (__vector unsigned long long) __A;
1282 
1283  *__P = __a[1];
1284 }
1285 
1286 /* Moves the upper two values of B into the lower two values of A. */
1287 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1288 _mm_movehl_ps (__m128 __A, __m128 __B)
1289 {
1290  return (__m128) vec_mergel ((__vector unsigned long long)__B,
1291  (__vector unsigned long long)__A);
1292 }
1293 
1294 /* Moves the lower two values of B into the upper two values of A. */
1295 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1296 _mm_movelh_ps (__m128 __A, __m128 __B)
1297 {
1298  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1299  (__vector unsigned long long)__B);
1300 }
1301 
1302 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1303  the upper two values are passed through from A. */
1304 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1306 {
1307  __vector unsigned long long __a = (__vector unsigned long long)__A;
1308  __vector unsigned long long __p = vec_splats(*__P);
1309  __a [0] = __p [0];
1310 
1311  return (__m128)__a;
1312 }
1313 
1314 /* Stores the lower two SPFP values of A into P. */
1315 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316 _mm_storel_pi (__m64 *__P, __m128 __A)
1317 {
1318  __vector unsigned long long __a = (__vector unsigned long long) __A;
1319 
1320  *__P = __a[0];
1321 }
1322 
1323 #ifdef _ARCH_PWR8
1324 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1325 
1326 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1327 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm_movemask_ps (__m128 __A)
1329 {
1330  __vector unsigned long long result;
1331  static const __vector unsigned int perm_mask =
1332  {
1333 #ifdef __LITTLE_ENDIAN__
1334  0x00204060, 0x80808080, 0x80808080, 0x80808080
1335 #else
1336  0x80808080, 0x80808080, 0x80808080, 0x00204060
1337 #endif
1338  };
1339 
1340  result = ((__vector unsigned long long)
1341  vec_vbpermq ((__vector unsigned char) __A,
1342  (__vector unsigned char) perm_mask));
1343 
1344 #ifdef __LITTLE_ENDIAN__
1345  return result[1];
1346 #else
1347  return result[0];
1348 #endif
1349 }
1350 #endif /* _ARCH_PWR8 */
1351 
1352 /* Create a vector with all four elements equal to *P. */
1353 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1354 _mm_load1_ps (float const *__P)
1355 {
1356  return _mm_set1_ps (*__P);
1357 }
1358 
1359 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1360 _mm_load_ps1 (float const *__P)
1361 {
1362  return _mm_load1_ps (__P);
1363 }
1364 
1365 /* Extracts one of the four words of A. The selector N must be immediate. */
1366 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1367 _mm_extract_pi16 (__m64 const __A, int const __N)
1368 {
1369  unsigned int shiftr = __N & 3;
1370 #ifdef __BIG_ENDIAN__
1371  shiftr = 3 - shiftr;
1372 #endif
1373 
1374  return ((__A >> (shiftr * 16)) & 0xffff);
1375 }
1376 
1377 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1378 _m_pextrw (__m64 const __A, int const __N)
1379 {
1380  return _mm_extract_pi16 (__A, __N);
1381 }
1382 
1383 /* Inserts word D into one of four words of A. The selector N must be
1384  immediate. */
1385 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1386 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1387 {
1388  const int shiftl = (__N & 3) * 16;
1389  const __m64 shiftD = (const __m64) __D << shiftl;
1390  const __m64 mask = 0xffffUL << shiftl;
1391  __m64 result = (__A & (~mask)) | (shiftD & mask);
1392 
1393  return (result);
1394 }
1395 
1396 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1397 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1398 {
1399  return _mm_insert_pi16 (__A, __D, __N);
1400 }
1401 
1402 /* Compute the element-wise maximum of signed 16-bit values. */
1403 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1404 
1405 _mm_max_pi16 (__m64 __A, __m64 __B)
1406 {
1407 #if _ARCH_PWR8
1408  __vector signed short a, b, r;
1409  __vector __bool short c;
1410 
1411  a = (__vector signed short)vec_splats (__A);
1412  b = (__vector signed short)vec_splats (__B);
1413  c = (__vector __bool short)vec_cmpgt (a, b);
1414  r = vec_sel (b, a, c);
1415  return (__m64) ((__vector long long) r)[0];
1416 #else
1417  __m64_union m1, m2, res;
1418 
1419  m1.as_m64 = __A;
1420  m2.as_m64 = __B;
1421 
1422  res.as_short[0] =
1423  (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1424  res.as_short[1] =
1425  (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1426  res.as_short[2] =
1427  (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1428  res.as_short[3] =
1429  (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1430 
1431  return (__m64) res.as_m64;
1432 #endif
1433 }
1434 
1435 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436 _m_pmaxsw (__m64 __A, __m64 __B)
1437 {
1438  return _mm_max_pi16 (__A, __B);
1439 }
1440 
1441 /* Compute the element-wise maximum of unsigned 8-bit values. */
1442 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1443 _mm_max_pu8 (__m64 __A, __m64 __B)
1444 {
1445 #if _ARCH_PWR8
1446  __vector unsigned char a, b, r;
1447  __vector __bool char c;
1448 
1449  a = (__vector unsigned char)vec_splats (__A);
1450  b = (__vector unsigned char)vec_splats (__B);
1451  c = (__vector __bool char)vec_cmpgt (a, b);
1452  r = vec_sel (b, a, c);
1453  return (__m64) ((__vector long long) r)[0];
1454 #else
1455  __m64_union m1, m2, res;
1456  long i;
1457 
1458  m1.as_m64 = __A;
1459  m2.as_m64 = __B;
1460 
1461 
1462  for (i = 0; i < 8; i++)
1463  res.as_char[i] =
1464  ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1465  m1.as_char[i] : m2.as_char[i];
1466 
1467  return (__m64) res.as_m64;
1468 #endif
1469 }
1470 
1471 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1472 _m_pmaxub (__m64 __A, __m64 __B)
1473 {
1474  return _mm_max_pu8 (__A, __B);
1475 }
1476 
1477 /* Compute the element-wise minimum of signed 16-bit values. */
1478 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1479 _mm_min_pi16 (__m64 __A, __m64 __B)
1480 {
1481 #if _ARCH_PWR8
1482  __vector signed short a, b, r;
1483  __vector __bool short c;
1484 
1485  a = (__vector signed short)vec_splats (__A);
1486  b = (__vector signed short)vec_splats (__B);
1487  c = (__vector __bool short)vec_cmplt (a, b);
1488  r = vec_sel (b, a, c);
1489  return (__m64) ((__vector long long) r)[0];
1490 #else
1491  __m64_union m1, m2, res;
1492 
1493  m1.as_m64 = __A;
1494  m2.as_m64 = __B;
1495 
1496  res.as_short[0] =
1497  (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1498  res.as_short[1] =
1499  (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1500  res.as_short[2] =
1501  (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1502  res.as_short[3] =
1503  (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1504 
1505  return (__m64) res.as_m64;
1506 #endif
1507 }
1508 
1509 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1510 _m_pminsw (__m64 __A, __m64 __B)
1511 {
1512  return _mm_min_pi16 (__A, __B);
1513 }
1514 
1515 /* Compute the element-wise minimum of unsigned 8-bit values. */
1516 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1517 _mm_min_pu8 (__m64 __A, __m64 __B)
1518 {
1519 #if _ARCH_PWR8
1520  __vector unsigned char a, b, r;
1521  __vector __bool char c;
1522 
1523  a = (__vector unsigned char)vec_splats (__A);
1524  b = (__vector unsigned char)vec_splats (__B);
1525  c = (__vector __bool char)vec_cmplt (a, b);
1526  r = vec_sel (b, a, c);
1527  return (__m64) ((__vector long long) r)[0];
1528 #else
1529  __m64_union m1, m2, res;
1530  long i;
1531 
1532  m1.as_m64 = __A;
1533  m2.as_m64 = __B;
1534 
1535 
1536  for (i = 0; i < 8; i++)
1537  res.as_char[i] =
1538  ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1539  m1.as_char[i] : m2.as_char[i];
1540 
1541  return (__m64) res.as_m64;
1542 #endif
1543 }
1544 
1545 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1546 _m_pminub (__m64 __A, __m64 __B)
1547 {
1548  return _mm_min_pu8 (__A, __B);
1549 }
1550 
1551 /* Create an 8-bit mask of the signs of 8-bit values. */
1552 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1553 _mm_movemask_pi8 (__m64 __A)
1554 {
1555  unsigned long long p =
1556 #ifdef __LITTLE_ENDIAN__
1557  0x0008101820283038UL; // permute control for sign bits
1558 #else
1559  0x3830282018100800UL; // permute control for sign bits
1560 #endif
1561  return __builtin_bpermd (p, __A);
1562 }
1563 
1564 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1565 _m_pmovmskb (__m64 __A)
1566 {
1567  return _mm_movemask_pi8 (__A);
1568 }
1569 
1570 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1571  in B and produce the high 16 bits of the 32-bit results. */
1572 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1573 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1574 {
1575  __vector unsigned short a, b;
1576  __vector unsigned short c;
1577  __vector unsigned int w0, w1;
1578  __vector unsigned char xform1 = {
1579 #ifdef __LITTLE_ENDIAN__
1580  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1581  0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1582 #else
1583  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
 1584  0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1585 #endif
1586  };
1587 
1588  a = (__vector unsigned short)vec_splats (__A);
1589  b = (__vector unsigned short)vec_splats (__B);
1590 
1591  w0 = vec_vmuleuh (a, b);
1592  w1 = vec_vmulouh (a, b);
1593  c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1594 
1595  return (__m64) ((__vector long long) c)[0];
1596 }
1597 
1598 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1599 _m_pmulhuw (__m64 __A, __m64 __B)
1600 {
1601  return _mm_mulhi_pu16 (__A, __B);
1602 }
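
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): each result lane of _mm_mulhi_pu16 is the upper 16 bits of the
   32-bit unsigned product of the corresponding lanes of A and B.  */
static __inline int
__example_mulhi_pu16 (void)
{
  __m64 a = _mm_set_pi16 (0, 0, 0, (short) 0xC000);   /* lane 0 = 49152 */
  __m64 b = _mm_set_pi16 (0, 0, 0, 4);
  /* 49152 * 4 = 0x00030000, so the high half of lane 0 is 3.  */
  return _mm_extract_pi16 (_mm_mulhi_pu16 (a, b), 0);
}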
1603 
1604 /* Return a combination of the four 16-bit values in A. The selector
1605  must be an immediate. */
1606 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1607 _mm_shuffle_pi16 (__m64 __A, int const __N)
1608 {
1609  unsigned long element_selector_10 = __N & 0x03;
1610  unsigned long element_selector_32 = (__N >> 2) & 0x03;
1611  unsigned long element_selector_54 = (__N >> 4) & 0x03;
1612  unsigned long element_selector_76 = (__N >> 6) & 0x03;
1613  static const unsigned short permute_selectors[4] =
1614  {
1615 #ifdef __LITTLE_ENDIAN__
1616  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1617 #else
1618  0x0607, 0x0405, 0x0203, 0x0001
1619 #endif
1620  };
1621  __m64_union t;
1622  __vector unsigned long long a, p, r;
1623 
1624 #ifdef __LITTLE_ENDIAN__
1625  t.as_short[0] = permute_selectors[element_selector_10];
1626  t.as_short[1] = permute_selectors[element_selector_32];
1627  t.as_short[2] = permute_selectors[element_selector_54];
1628  t.as_short[3] = permute_selectors[element_selector_76];
1629 #else
1630  t.as_short[3] = permute_selectors[element_selector_10];
1631  t.as_short[2] = permute_selectors[element_selector_32];
1632  t.as_short[1] = permute_selectors[element_selector_54];
1633  t.as_short[0] = permute_selectors[element_selector_76];
1634 #endif
1635  p = vec_splats (t.as_m64);
1636  a = vec_splats (__A);
1637  r = vec_perm (a, a, (__vector unsigned char)p);
1638  return (__m64) ((__vector long long) r)[0];
1639 }
1640 
1641 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1642 _m_pshufw (__m64 __A, int const __N)
1643 {
1644  return _mm_shuffle_pi16 (__A, __N);
1645 }
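
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): lane i of the _mm_shuffle_pi16 result is lane ((__N >> 2*i) & 3)
   of A, so _MM_SHUFFLE (0, 1, 2, 3) reverses the four 16-bit lanes.  */
static __inline __m64
__example_reverse_pi16 (__m64 a)
{
  return _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));
}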
1646 
1647 /* Conditionally store byte elements of A into P. The high bit of each
1648  byte in the selector N determines whether the corresponding byte from
1649  A is stored. */
1650 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1651 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1652 {
1653  __m64 hibit = 0x8080808080808080UL;
 1654  __m64 mask, tmp;
 1655  __m64 *p = (__m64*)__P;
1656 
1657  tmp = *p;
1658  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1659  tmp = (tmp & (~mask)) | (__A & mask);
1660  *p = tmp;
1661 }
1662 
1663 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1665 {
1666  _mm_maskmove_si64 (__A, __N, __P);
1667 }
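
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): only the bytes whose selector byte has its high bit set are
   written; the remaining destination bytes are left untouched.  */
static __inline void
__example_maskmove (char *dst)   /* dst must provide at least 8 bytes */
{
  __m64 data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
  /* Store only the two low-order bytes of data (values 1 and 2).  */
  __m64 sel = _mm_set_pi8 (0, 0, 0, 0, 0, 0, (char) 0x80, (char) 0x80);
  _mm_maskmove_si64 (data, sel, dst);
}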
1668 
1669 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1670 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1671 _mm_avg_pu8 (__m64 __A, __m64 __B)
1672 {
1673  __vector unsigned char a, b, c;
1674 
1675  a = (__vector unsigned char)vec_splats (__A);
1676  b = (__vector unsigned char)vec_splats (__B);
1677  c = vec_avg (a, b);
1678  return (__m64) ((__vector long long) c)[0];
1679 }
1680 
1681 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1682 _m_pavgb (__m64 __A, __m64 __B)
1683 {
1684  return _mm_avg_pu8 (__A, __B);
1685 }
1686 
1687 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1688 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1689 _mm_avg_pu16 (__m64 __A, __m64 __B)
1690 {
1691  __vector unsigned short a, b, c;
1692 
1693  a = (__vector unsigned short)vec_splats (__A);
1694  b = (__vector unsigned short)vec_splats (__B);
1695  c = vec_avg (a, b);
1696  return (__m64) ((__vector long long) c)[0];
1697 }
1698 
1699 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1700 _m_pavgw (__m64 __A, __m64 __B)
1701 {
1702  return _mm_avg_pu16 (__A, __B);
1703 }
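
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): each lane of _mm_avg_pu16 is the rounded average
   (a + b + 1) >> 1 of the unsigned lanes, so avg (10, 13) is 12, not 11.  */
static __inline int
__example_avg_pu16 (void)
{
  __m64 a = _mm_set_pi16 (0, 0, 0, 10);
  __m64 b = _mm_set_pi16 (0, 0, 0, 13);
  return _mm_extract_pi16 (_mm_avg_pu16 (a, b), 0);   /* 12 */
}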
1704 
1705 /* Compute the sum of the absolute differences of the unsigned 8-bit
1706  values in A and B. Return the value in the lower 16-bit word; the
1707  upper words are cleared. */
1708 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1709 _mm_sad_pu8 (__m64 __A, __m64 __B)
1710 {
1711  __vector unsigned char a, b;
1712  __vector unsigned char vmin, vmax, vabsdiff;
1713  __vector signed int vsum;
1714  const __vector unsigned int zero =
1715  { 0, 0, 0, 0 };
1716  __m64_union result = {0};
1717 
1718  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1719  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1720  vmin = vec_min (a, b);
1721  vmax = vec_max (a, b);
1722  vabsdiff = vec_sub (vmax, vmin);
1723  /* Sum four groups of bytes into integers. */
1724  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1725  /* Sum across four integers with integer result. */
1726  vsum = vec_sums (vsum, (__vector signed int) zero);
 1727  /* The sum is in the rightmost 32 bits of the vector result.
1728  Transfer to a GPR and truncate to 16 bits. */
1729  result.as_short[0] = vsum[3];
1730  return result.as_m64;
1731 }
1732 
1733 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1734 _m_psadbw (__m64 __A, __m64 __B)
1735 {
1736  return _mm_sad_pu8 (__A, __B);
1737 }
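
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): the low 16-bit lane of the _mm_sad_pu8 result holds the sum of
   the eight absolute byte differences; the other lanes are zero.  */
static __inline int
__example_sad_pu8 (void)
{
  __m64 a = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 10, 1);
  __m64 b = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 3, 5);
  /* |10 - 3| + |1 - 5| = 7 + 4 = 11.  */
  return _mm_extract_pi16 (_mm_sad_pu8 (a, b), 0);
}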
1738 
1739 /* Stores the data in A to the address P without polluting the caches. */
1740 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1741 _mm_stream_pi (__m64 *__P, __m64 __A)
1742 {
1743  /* Use the data cache block touch for store transient. */
1744  __asm__ (
1745  " dcbtstt 0,%0"
1746  :
1747  : "b" (__P)
1748  : "memory"
1749  );
1750  *__P = __A;
1751 }
1752 
1753 /* Likewise. The address must be 16-byte aligned. */
1754 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1755 _mm_stream_ps (float *__P, __m128 __A)
1756 {
1757  /* Use the data cache block touch for store transient. */
1758  __asm__ (
1759  " dcbtstt 0,%0"
1760  :
1761  : "b" (__P)
1762  : "memory"
1763  );
1764  _mm_store_ps (__P, __A);
1765 }
1766 
1767 /* Guarantees that every preceding store is globally visible before
1768  any subsequent store. */
1769 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1770 _mm_sfence (void)
1771 {
1772  /* Generate a light weight sync. */
1773  __atomic_thread_fence (__ATOMIC_RELEASE);
1774 }
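
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): the classic producer pattern _mm_sfence is meant for: store the
   payload, fence, then publish the flag.  The names are only for this
   simplified example.  */
static __inline void
__example_publish (float *dst, volatile int *ready, __m128 value)
{
  _mm_storeu_ps (dst, value);   /* store the payload ...             */
  _mm_sfence ();                /* ... make it globally visible ...  */
  *ready = 1;                   /* ... then set the ready flag.      */
}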
1775 
 1776 /* The execution of the next instruction is delayed by an
 1777  implementation-specific amount of time. The instruction does not modify
 1778  the architectural state. In the x86 header this intrinsic follows the
 1779  pop_options pragma because it does not require SSE support in the
 1780  processor; the encoding is a nop on processors that do not support it. */
1781 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1782 _mm_pause (void)
1783 {
1784  /* There is no exact match with this construct, but the following is
1785  close to the desired effect. */
1786 #if _ARCH_PWR8
1787  /* On power8 and later processors we can depend on Program Priority
 1788  (PRI) and the associated "very low" PRI setting. Since we don't know
 1789  what PRI this thread is running at we: 1) save the current PRI
 1790  from the PPR SPR into a local GPR, 2) set the PRI to "very low"
 1791  via the special or 31,31,31 encoding, 3) issue an "isync" to
 1792  ensure the PRI change takes effect before we execute any more
1793  instructions.
1794  Now we can execute a lwsync (release barrier) while we execute
1795  this thread at "very low" PRI. Finally we restore the original
1796  PRI and continue execution. */
1797  unsigned long __PPR;
1798 
1799  __asm__ volatile (
1800  " mfppr %0;"
1801  " or 31,31,31;"
1802  " isync;"
1803  " lwsync;"
1804  " isync;"
1805  " mtppr %0;"
1806  : "=r" (__PPR)
1807  :
1808  : "memory"
1809  );
1810 #else
 1811  /* For older processors, where we may not even have Program Priority
 1812  controls, we can only depend on Heavy Weight Sync. */
1813  __atomic_thread_fence (__ATOMIC_SEQ_CST);
1814 #endif
1815 }
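
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): the spin-wait loop _mm_pause is intended for; the priority drop
   above keeps the waiting hardware thread from starving its siblings.  */
static __inline void
__example_spin_wait (volatile int *flag)
{
  while (*flag == 0)
    _mm_pause ();
}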
1816 
1817 /* Transpose the 4x4 matrix composed of row[0-3]. */
1818 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1819 do { \
1820  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1821  __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1822  __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1823  __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1824  __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1825  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1826  (__vector long long)__t1); \
1827  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1828  (__vector long long)__t1); \
1829  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1830  (__vector long long)__t3); \
1831  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1832  (__vector long long)__t3); \
1833 } while (0)
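
/* Illustrative usage sketch (editor's addition, not part of the upstream
   header): transposing a row-major 4x4 float matrix in place with the macro
   above, using the unaligned load/store helpers from this header.  */
static __inline void
__example_transpose4 (float m[16])
{
  __m128 r0 = _mm_loadu_ps (m + 0);
  __m128 r1 = _mm_loadu_ps (m + 4);
  __m128 r2 = _mm_loadu_ps (m + 8);
  __m128 r3 = _mm_loadu_ps (m + 12);
  _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
  _mm_storeu_ps (m + 0, r0);
  _mm_storeu_ps (m + 4, r1);
  _mm_storeu_ps (m + 8, r2);
  _mm_storeu_ps (m + 12, r3);
}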
1834 
1835 /* For backward source compatibility. */
1836 //# include <emmintrin.h>
1837 
1838 #endif /* _XMMINTRIN_H_INCLUDED */