/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for these vector SIMD
   operations. However, scalar floating-point operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level. There are also
   differences in the data format and placement of floating-point
   scalars in the vector register, which require extra steps to match
   SSE2 scalar floating-point semantics on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended
   to use portable <fenv.h> instead of accessing the MXCSR directly.

   Most SSE2 scalar floating-point intrinsic operations can be performed
   more efficiently as C language double scalar operations or optimized
   to use vector SIMD operations. We recommend this for new applications.
*/
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
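
/* For example, code that manipulated the MXCSR exception flags directly
   can usually be expressed with <fenv.h>. A minimal sketch (hypothetical
   application code, not part of this header):

     #include <fenv.h>

     feclearexcept(FE_ALL_EXCEPT);     // instead of clearing MXCSR flags
     if (fetestexcept(FE_OVERFLOW))    // instead of reading MXCSR bits
       handle_overflow();              // hypothetical handler
*/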

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__ppc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files. */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types. */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define two value permute mask. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
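
/* A minimal sketch of how the mask is used (see _mm_shuffle_pd below):
   _MM_SHUFFLE2(1, 0) evaluates to 0x2, selecting element 0 of the first
   operand for the low result element and element 1 of the second
   operand for the high result element:

     __m128d __r = _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(1, 0));
     // __r[0] == __a[0], __r[1] == __b[1]
*/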

/* Create a vector with element 0 as F and the rest zero. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd(double __W, double __X) {
  return __extension__(__m128d){__X, __W};
}

/* Create a vector with the lower value W and upper value X. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pd(double __W, double __X) {
  return __extension__(__m128d){__W, __X};
}

/* Create an undefined vector. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_pd(void) {
  /* Intentional self-initialization: the contents are unspecified. */
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_pd(void) {
  return (__m128d)vec_splats(0);
}

/* Sets the low DPFP value of A from the low value of B. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_sd(__m128d __A, __m128d __B) {
  __v2df __result = (__v2df)__A;
  __result[0] = ((__v2df)__B)[0];
  return (__m128d)__result;
}

/* Load two DPFP values from P. The address must be 16-byte aligned. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd(double const *__P) {
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
}

/* Load two DPFP values from P. The address need not be 16-byte aligned. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_pd(double const *__P) {
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with all two elements equal to *P. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_pd(double const *__P) {
  return (vec_splats(*__P));
}

/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_sd(double const *__P) {
  return _mm_set_sd(*__P);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd1(double const *__P) {
  return _mm_load1_pd(__P);
}

/* Load two DPFP values in reverse order. The address must be aligned. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_pd(double const *__P) {
  __v2df __tmp = _mm_load_pd(__P);
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}

/* Store two DPFP values. The address must be 16-byte aligned. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd(double *__P, __m128d __A) {
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
}

/* Store two DPFP values. The address need not be 16-byte aligned. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_pd(double *__P, __m128d __A) {
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_sd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[0];
}

extern __inline double
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_f64(__m128d __A) {
  return ((__v2df)__A)[0];
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pd(double *__P, __m128d __A) {
  _mm_store_sd(__P, __A);
}

/* Stores the upper DPFP value. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_splat(__A, 0));
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd1(double *__P, __m128d __A) {
  _mm_store1_pd(__P, __A);
}

/* Store two DPFP values in reverse order. The address must be aligned. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
}
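
/* A short usage sketch (hypothetical application code) contrasting the
   aligned and unaligned forms above:

     double __attribute__((aligned(16))) in[2] = {1.0, 2.0};
     double out[2];                    // no alignment guarantee

     __m128d __v = _mm_load_pd(in);    // requires 16-byte alignment
     _mm_storeu_pd(out, __v);          // tolerates any alignment
*/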

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64(__m128i __A) {
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64x(__m128i __A) {
  return ((__v2di)__A)[0];
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_pd(__m128d __A) {
  return (vec_sqrt(__A));
}

/* Return pair {sqrt (B[0]), A[1]}. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
  __v2df __c;
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pd(__m128d __A, __m128d __B) {
  return (vec_min(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_min(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pd(__m128d __A, __m128d __B) {
  return (vec_max(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_max(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN. */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NAN and B != NAN. */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN. */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN). */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN. */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert the results so that true ('1's) marks NAN. */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}
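
/* A worked example of the ordered/unordered predicates: with
   __a = {1.0, NAN} and __b = {2.0, 2.0}, _mm_cmpord_pd(__a, __b)
   yields {all-ones, all-zeros} while _mm_cmpunord_pd(__a, __b)
   yields {all-zeros, all-ones}, since only element 1 involves a NaN. */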

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results. So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation. */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A. */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal. */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than. */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal. */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than. */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
   be OK. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
                  short __q2, short __q1, short __q0) {
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
                                        __q4, __q5, __q6, __q7};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
                 char __q10, char __q09, char __q08, char __q07, char __q06,
                 char __q05, char __q04, char __q03, char __q02, char __q01,
                 char __q00) {
  return __extension__(__m128i)(__v16qi){
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}

/* Set all of the elements of the vector to A. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64x(long long __A) {
  return _mm_set_epi64x(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64(__m64 __A) {
  return _mm_set_epi64(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi32(int __A) {
  return _mm_set_epi32(__A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi16(short __A) {
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi8(char __A) {
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
                      __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
                   short __q5, short __q6, short __q7) {
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
                  char __q05, char __q06, char __q07, char __q08, char __q09,
                  char __q10, char __q11, char __q12, char __q13, char __q14,
                  char __q15) {
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
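
/* A quick equivalence sketch: the _mm_setr_* argument order is memory
   order (element 0 first), so

     _mm_setr_epi32(0, 1, 2, 3) == _mm_set_epi32(3, 2, 1, 0)

   and both place 0 in element 0 and 3 in element 3. */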

/* Load 128 bits of integer data. The address must be 16-byte aligned. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_si128(__m128i const *__P) {
  return *__P;
}

/* Load 128 bits of integer data. The address need not be aligned. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_si128(__m128i_u const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_epi64(__m128i_u const *__P) {
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_si128(__m128i *__P, __m128i __B) {
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  *__P = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movepi64_pi64(__m128i_u __B) {
  return (__m64)((__v2di)__B)[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movpi64_epi64(__m64 __A) {
  return _mm_set_epi64((__m64)0LL, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_epi64(__m128i __A) {
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  /* Intentional self-initialization: the contents are unspecified. */
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_pd(__m128i __A) {
  __v2di __val;
  /* For LE we need Vector Unpack Low Signed Word,
     which vec_unpackh generates. */
  __val = (__v2di)vec_unpackh((__v4si)__A);

  return (__m128d)vec_ctf(__val, 0);
}
#endif

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_ps(__m128i __A) {
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_epi32(__m128d __A) {
  __v2df __rounded = vec_rint(__A);
  __v4si __result, __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate. */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return (__m128i)__result;
}
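
/* A worked example, assuming the default round-to-nearest-even mode:
   _mm_cvtpd_epi32(_mm_set_pd(2.5, 1.5)) yields {2, 2, 0, 0}, since
   vec_rint rounds both halfway cases to the nearest even value and the
   upper two result elements are zeroed. */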

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_ps(__m128d __A) {
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_epi32(__m128d __A) {
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate. */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif

  return ((__m128i)__result);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvttpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si32(__m128i __A) {
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_pd(__m64 __A) {
  __v4si __temp;
  __v2di __tmp2;
  __v2df __result;

  __temp = (__v4si)vec_splats(__A);
  __tmp2 = (__v2di)vec_unpackl(__temp);
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
  return (__m128d)__result;
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_epi32(__m128 __A) {
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf)__A);
  __result = vec_cts(__rounded, 0);
  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_epi32(__m128 __A) {
  __v4si __result;

  __result = vec_cts((__v4sf)__A, 0);
  return (__m128i)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pd(__m128 __A) {
  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
#ifdef vec_doubleh
  return (__m128d)vec_doubleh((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and we need to generate the
     equivalent code. */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up. */
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we merge the
     vector with itself to get the elements lined up. */
  __temp = vec_vmrghw(__a, __a);
#endif
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
  return (__m128d)__result;
#endif
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si32(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  long long __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64x(__m128d __A) {
  return _mm_cvtsd_si64((__v2df)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si32(__m128d __A) {
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64(__m128d __A) {
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64x(__m128d __A) {
  return _mm_cvttsd_si64(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion. */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements. */
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
  /* Convert double to single float scalar in a vector. */
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
  /* Shift the resulting scalar into vector element [0]. */
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
#else
  __result[0] = ((__v2df)__B)[0];
#endif
  return (__m128)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_sd(__m128d __A, int __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_sd(__m128d __A, long long __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
  return _mm_cvtsi64_sd(__A, __B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert. */
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector. */
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
  return (__m128d)vec_mergel(__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res[0] = ((__v4sf)__B)[0];
  return (__m128d)__res;
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh(__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__A, __B, 1);
#endif
  else
    __result = vec_mergel(__A, __B);

  return __result;
}
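
/* A usage sketch tying this back to _MM_SHUFFLE2: with
   __a = {10.0, 11.0} and __b = {20.0, 21.0},

     _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(0, 1))   // mask 0x1

   selects __a[1] for the low element and __b[0] for the high element,
   yielding {11.0, 20.0}. */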

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Creates a 2-bit mask from the most significant bits of the DPFP values. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pd(__m128d __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v2du)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
  };

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
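
/* A worked example: _mm_movemask_pd(_mm_set_pd(-2.0, 1.0)) returns 0x2,
   since element 1 (-2.0) has its sign bit set and element 0 (1.0) does
   not. */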

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi8(__m128i __A, __m128i __B) {
  return (__m128i)((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi32(__m128i __A, __m128i __B) {
  return (__m128i)((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_epi64(__m128i __A, __m128i __B) {
  return (__m128i)((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_epi16(__m128i __A, __m128i __B) {
  __vector signed int __zero = {0, 0, 0, 0};

  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
  __vector signed int __w0, __w1;

  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi16(__m128i __A, __m128i __B) {
  return (__m128i)((__v8hi)__A * (__v8hi)__B);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_su32(__m64 __A, __m64 __B) {
  unsigned int __a = __A;
  unsigned int __b = __B;

  return ((__m64)__a * (__m64)__b);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epu32(__m128i __A, __m128i __B) {
#if __GNUC__ < 8
  __v2du __result;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word. */
  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#else
  /* VMX Vector Multiply Even Unsigned Word. */
  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
#endif
  return (__m128i)__result;
#else
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
#endif
}
#endif
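
/* Semantics recap, per the Intel definition: _mm_mul_epu32 multiplies
   the even-numbered (0 and 2) unsigned 32-bit elements of each operand
   into two unsigned 64-bit products; with __a = {a0, a1, a2, a3} and
   __b = {b0, b1, b2, b3}, the result is {a0*b0, a2*b2}. */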

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi16(__m128i __A, int __B) {
  __v8hu __lshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B >= 0 && __B < 16) {
    if (__builtin_constant_p(__B))
      __lshift = (__v8hu)vec_splat_s16(__B);
    else
      __lshift = vec_splats((unsigned short)__B);

    __result = vec_sl((__v8hi)__A, __lshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi32(__m128i __A, int __B) {
  __v4su __lshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B >= 0 && __B < 32) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v4su)vec_splat_s32(__B);
    else
      __lshift = vec_splats((unsigned int)__B);

    __result = vec_sl((__v4si)__A, __lshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi64(__m128i __A, int __B) {
  __v2du __lshift;
  __v2di __result = {0, 0};

  if (__B >= 0 && __B < 64) {
    if (__builtin_constant_p(__B) && __B < 16)
      __lshift = (__v2du)vec_splat_s32(__B);
    else
      __lshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sl((__v2di)__A, __lshift);
  }

  return (__m128i)__result;
}
#endif
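
/* The range guards above implement the SSE2 shift semantics, which
   differ from C: a count outside the element width yields an all-zero
   result instead of undefined behavior. For example,
   _mm_slli_epi16(__x, 16) is a vector of zeros for any __x. */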

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi16(__m128i __A, int __B) {
  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hi __result;

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);
  }
  __result = vec_sra((__v8hi)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_epi32(__m128i __A, int __B) {
  __v4su __rshift = {31, 31, 31, 31};
  __v4si __result;

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);
  }
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bslli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
    __result = vec_sld((__v16qu)__A, __zeros, __N);
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_bsrli_si128(__m128i __A, const int __N) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (__N < 16)
#ifdef __LITTLE_ENDIAN__
    if (__builtin_constant_p(__N))
      /* Would like to use Vector Shift Left Double by Octet
         Immediate here to use the immediate form and avoid
         load of __N * 8 value into a separate VR. */
      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
    else
#endif
    {
      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
#ifdef __LITTLE_ENDIAN__
      __result = vec_sro((__v16qu)__A, __shift);
#else
      __result = vec_slo((__v16qu)__A, __shift);
#endif
    }
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si128(__m128i __A, const int __N) {
  return _mm_bsrli_si128(__A, __N);
}
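
/* A quick sketch of the byte-wise shifts: they shift the entire 128-bit
   vector by __N bytes, filling with zeros. For example, if byte element
   0 of __x holds 0x00 and byte element 4 holds 0x04, then
   _mm_bsrli_si128(__x, 4) produces a vector whose element 0 is 0x04 and
   whose top four bytes are zero. */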

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si128(__m128i __A, const int _imm5) {
  __v16qu __result;
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

  if (_imm5 < 16)
#ifdef __LITTLE_ENDIAN__
    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
#else
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
#endif
  else
    __result = __zeros;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi16(__m128i __A, int __B) {
  __v8hu __rshift;
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};

  if (__B < 16) {
    if (__builtin_constant_p(__B))
      __rshift = (__v8hu)vec_splat_s16(__B);
    else
      __rshift = vec_splats((unsigned short)__B);

    __result = vec_sr((__v8hi)__A, __rshift);
  }

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi32(__m128i __A, int __B) {
  __v4su __rshift;
  __v4si __result = {0, 0, 0, 0};

  if (__B < 32) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v4su)vec_splat_s32(__B);
      else
        __rshift = (__v4su)vec_splats((unsigned int)__B);
    } else
      __rshift = vec_splats((unsigned int)__B);

    __result = vec_sr((__v4si)__A, __rshift);
  }

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi64(__m128i __A, int __B) {
  __v2du __rshift;
  __v2di __result = {0, 0};

  if (__B < 64) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v2du)vec_splat_s32(__B);
      else
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
    } else
      __rshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sr((__v2di)__A, __rshift);
  }

  return (__m128i)__result;
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi16(__m128i __A, __m128i __B) {
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v8hu)__B, 0);
#else
  __lshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__lshift, __shmax);
  __result = vec_sl((__v8hu)__A, __lshift);
  /* Where the count exceeds 15, __shmask is all-zeros, so selecting
     from the mask itself yields the required zero result. */
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}
1680 
1681 extern __inline __m128i
1682  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683  _mm_sll_epi32(__m128i __A, __m128i __B) {
1684  __v4su __lshift;
1685  __vector __bool int __shmask;
1686  const __v4su __shmax = {32, 32, 32, 32};
1687  __v4su __result;
1688 #ifdef __LITTLE_ENDIAN__
1689  __lshift = vec_splat((__v4su)__B, 0);
1690 #else
1691  __lshift = vec_splat((__v4su)__B, 1);
1692 #endif
1693  __shmask = vec_cmplt(__lshift, __shmax);
1694  __result = vec_sl((__v4su)__A, __lshift);
1695  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1696 
1697  return (__m128i)__result;
1698 }
1699 
1700 #ifdef _ARCH_PWR8
1701 extern __inline __m128i
1702  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1703  _mm_sll_epi64(__m128i __A, __m128i __B) {
1704  __v2du __lshift;
1705  __vector __bool long long __shmask;
1706  const __v2du __shmax = {64, 64};
1707  __v2du __result;
1708 
1709  __lshift = vec_splat((__v2du)__B, 0);
1710  __shmask = vec_cmplt(__lshift, __shmax);
1711  __result = vec_sl((__v2du)__A, __lshift);
1712  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1713 
1714  return (__m128i)__result;
1715 }
1716 #endif
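
/* Illustrative sketch (editor's example, not part of the original
   header): for the _mm_sll_* forms the count is read from the low
   element of the second vector (hence the vec_splat above), and a count
   past the element width produces zero.  _mm_set_epi32 is defined
   earlier in this header.  */
#if 0
static __inline __m128i __example_sll_count(__m128i __x) {
  __m128i __by3 = _mm_sll_epi16(__x, _mm_set_epi32(0, 0, 0, 3));
  (void)_mm_sll_epi16(__x, _mm_set_epi32(0, 0, 0, 16)); /* -> all zeros */
  return __by3;
}
#endif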
1717 
1718 extern __inline __m128i
1719  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720  _mm_sra_epi16(__m128i __A, __m128i __B) {
1721  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1722  __v8hu __rshift;
1723  __v8hi __result;
1724 
1725 #ifdef __LITTLE_ENDIAN__
1726  __rshift = vec_splat((__v8hu)__B, 0);
1727 #else
1728  __rshift = vec_splat((__v8hu)__B, 3);
1729 #endif
1730  __rshift = vec_min(__rshift, __rshmax);
1731  __result = vec_sra((__v8hi)__A, __rshift);
1732 
1733  return (__m128i)__result;
1734 }
1735 
1736 extern __inline __m128i
1737  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738  _mm_sra_epi32(__m128i __A, __m128i __B) {
1739  const __v4su __rshmax = {31, 31, 31, 31};
1740  __v4su __rshift;
1741  __v4si __result;
1742 
1743 #ifdef __LITTLE_ENDIAN__
1744  __rshift = vec_splat((__v4su)__B, 0);
1745 #else
1746  __rshift = vec_splat((__v4su)__B, 1);
1747 #endif
1748  __rshift = vec_min(__rshift, __rshmax);
1749  __result = vec_sra((__v4si)__A, __rshift);
1750 
1751  return (__m128i)__result;
1752 }
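
/* Illustrative sketch (editor's example, not part of the original
   header): arithmetic shifts clamp the count (vec_min above) instead of
   zeroing, so an oversized count leaves 0 in non-negative lanes and -1
   in negative ones.  */
#if 0
static __inline __m128i __example_sra(__m128i __x) {
  /* A count of 99 is clamped to 31: every lane collapses to its sign. */
  return _mm_sra_epi32(__x, _mm_set_epi32(0, 0, 0, 99));
}
#endif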
1753 
1754 extern __inline __m128i
1755  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756  _mm_srl_epi16(__m128i __A, __m128i __B) {
1757  __v8hu __rshift;
1758  __vector __bool short __shmask;
1759  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1760  __v8hu __result;
1761 
1762 #ifdef __LITTLE_ENDIAN__
1763  __rshift = vec_splat((__v8hu)__B, 0);
1764 #else
1765  __rshift = vec_splat((__v8hu)__B, 3);
1766 #endif
1767  __shmask = vec_cmple(__rshift, __shmax);
1768  __result = vec_sr((__v8hu)__A, __rshift);
1769  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1770 
1771  return (__m128i)__result;
1772 }
1773 
1774 extern __inline __m128i
1775  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1776  _mm_srl_epi32(__m128i __A, __m128i __B) {
1777  __v4su __rshift;
1778  __vector __bool int __shmask;
1779  const __v4su __shmax = {32, 32, 32, 32};
1780  __v4su __result;
1781 
1782 #ifdef __LITTLE_ENDIAN__
1783  __rshift = vec_splat((__v4su)__B, 0);
1784 #else
1785  __rshift = vec_splat((__v4su)__B, 1);
1786 #endif
1787  __shmask = vec_cmplt(__rshift, __shmax);
1788  __result = vec_sr((__v4su)__A, __rshift);
1789  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1790 
1791  return (__m128i)__result;
1792 }
1793 
1794 #ifdef _ARCH_PWR8
1795 extern __inline __m128i
1796  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1797  _mm_srl_epi64(__m128i __A, __m128i __B) {
1798  __v2du __rshift;
1799  __vector __bool long long __shmask;
1800  const __v2du __shmax = {64, 64};
1801  __v2du __result;
1802 
1803  __rshift = vec_splat((__v2du)__B, 0);
1804  __shmask = vec_cmplt(__rshift, __shmax);
1805  __result = vec_sr((__v2du)__A, __rshift);
1806  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1807 
1808  return (__m128i)__result;
1809 }
1810 #endif
1811 
1812 extern __inline __m128d
1813  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1814  _mm_and_pd(__m128d __A, __m128d __B) {
1815  return (vec_and((__v2df)__A, (__v2df)__B));
1816 }
1817 
1818 extern __inline __m128d
1819  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1820  _mm_andnot_pd(__m128d __A, __m128d __B) {
1821  return (vec_andc((__v2df)__B, (__v2df)__A));
1822 }
1823 
1824 extern __inline __m128d
1825  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1826  _mm_or_pd(__m128d __A, __m128d __B) {
1827  return (vec_or((__v2df)__A, (__v2df)__B));
1828 }
1829 
1830 extern __inline __m128d
1831  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1832  _mm_xor_pd(__m128d __A, __m128d __B) {
1833  return (vec_xor((__v2df)__A, (__v2df)__B));
1834 }
1835 
1836 extern __inline __m128i
1837  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1838  _mm_and_si128(__m128i __A, __m128i __B) {
1839  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1840 }
1841 
1842 extern __inline __m128i
1843  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1844  _mm_andnot_si128(__m128i __A, __m128i __B) {
1845  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1846 }
1847 
1848 extern __inline __m128i
1849  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1850  _mm_or_si128(__m128i __A, __m128i __B) {
1851  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1852 }
1853 
1854 extern __inline __m128i
1855  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1856  _mm_xor_si128(__m128i __A, __m128i __B) {
1857  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1858 }
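
/* Illustrative sketch (editor's example, not part of the original
   header): in the and-not forms the complement applies to the FIRST
   argument, which is why the implementations above pass __B before __A
   to vec_andc.  */
#if 0
static __inline __m128i __example_clear_bits(__m128i __val, __m128i __mask) {
  /* Clears the bits of __val that are set in __mask: (~__mask) & __val. */
  return _mm_andnot_si128(__mask, __val);
}
#endif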
1859 
1860 extern __inline __m128i
1861  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1862  _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1863  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1864 }
1865 
1866 extern __inline __m128i
1867  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1868  _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1869  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1870 }
1871 
1872 extern __inline __m128i
1873  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1874  _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1875  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1876 }
1877 
1878 extern __inline __m128i
1879  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1880  _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1881  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1882 }
1883 
1884 extern __inline __m128i
1885  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1886  _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1887  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1888 }
1889 
1890 extern __inline __m128i
1891  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1892  _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1893  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1894 }
1895 
1896 extern __inline __m128i
1897  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1898  _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1899  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1900 }
1901 
1902 extern __inline __m128i
1903  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1904  _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1905  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1906 }
1907 
1908 extern __inline __m128i
1909  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1910  _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1911  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1912 }
1913 
1914 extern __inline int
1915  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1916  _mm_extract_epi16(__m128i const __A, int const __N) {
1917  return (unsigned short)((__v8hi)__A)[__N & 7];
1918 }
1919 
1920 extern __inline __m128i
1921  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1922  _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1923  __v8hi __result = (__v8hi)__A;
1924 
1925  __result[(__N & 7)] = __D;
1926 
1927  return (__m128i)__result;
1928 }
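
/* Illustrative sketch (editor's example, not part of the original
   header): the lane selector is masked with 7, so out-of-range indices
   wrap instead of trapping.  */
#if 0
static __inline int __example_insert_extract(__m128i __v) {
  __m128i __w = _mm_insert_epi16(__v, 0x1234, 1);
  return _mm_extract_epi16(__w, 9); /* 9 & 7 == 1, so returns 0x1234 */
}
#endif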
1929 
1930 extern __inline __m128i
1931  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1932  _mm_max_epi16(__m128i __A, __m128i __B) {
1933  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1934 }
1935 
1936 extern __inline __m128i
1937  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1938  _mm_max_epu8(__m128i __A, __m128i __B) {
1939  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1940 }
1941 
1942 extern __inline __m128i
1943  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1944  _mm_min_epi16(__m128i __A, __m128i __B) {
1945  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1946 }
1947 
1948 extern __inline __m128i
1949  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1950  _mm_min_epu8(__m128i __A, __m128i __B) {
1951  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1952 }
1953 
1954 #ifdef _ARCH_PWR8
1955 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1956 
1957 /* Return a mask created from the most significant bit of each 8-bit
1958  element in A. */
1959 extern __inline int
1960  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1961  _mm_movemask_epi8(__m128i __A) {
1962 #ifdef _ARCH_PWR10
1963  return vec_extractm((__v16qu)__A);
1964 #else
1965  __vector unsigned long long __result;
1966  static const __vector unsigned char __perm_mask = {
1967  0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1968  0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1969 
1970  __result = ((__vector unsigned long long)vec_vbpermq(
1971  (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1972 
1973 #ifdef __LITTLE_ENDIAN__
1974  return __result[1];
1975 #else
1976  return __result[0];
1977 #endif
1978 #endif /* !_ARCH_PWR10 */
1979 }
1980 #endif /* _ARCH_PWR8 */
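
/* Illustrative sketch (editor's example, not part of the original
   header; requires the PowerISA 2.07 build above): bit i of the result
   is the sign bit of byte i, so a vector with the high bit set in every
   byte yields 0xFFFF.  _mm_set1_epi8 is defined earlier in this
   header.  */
#if 0
static __inline int __example_movemask(void) {
  return _mm_movemask_epi8(_mm_set1_epi8((char)0x80)); /* == 0xFFFF */
}
#endif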
1981 
1982 extern __inline __m128i
1983  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1984  _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1985  __v4su __w0, __w1;
1986  __v16qu __xform1 = {
1987 #ifdef __LITTLE_ENDIAN__
1988  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1989  0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1990 #else
1991  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1992  0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1993 #endif
1994  };
1995 
1996  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1997  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1998  return (__m128i)vec_perm(__w0, __w1, __xform1);
1999 }
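
/* Illustrative sketch (editor's example, not part of the original
   header): the even/odd multiplies plus the permute above recover the
   high 16 bits of each 32-bit product.  */
#if 0
static __inline int __example_mulhi(void) {
  __m128i __a = _mm_set1_epi16((short)0xFFFF);
  /* 0xFFFF * 0xFFFF == 0xFFFE0001, so every lane holds 0xFFFE. */
  return _mm_extract_epi16(_mm_mulhi_epu16(__a, __a), 0);
}
#endif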
2000 
2001 extern __inline __m128i
2002  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2003  _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2004  unsigned long __element_selector_98 = __mask & 0x03;
2005  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2006  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2007  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2008  static const unsigned short __permute_selectors[4] = {
2009 #ifdef __LITTLE_ENDIAN__
2010  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2011 #else
2012  0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2013 #endif
2014  };
2015  __v2du __pmask =
2016 #ifdef __LITTLE_ENDIAN__
2017  {0x1716151413121110UL, 0UL};
2018 #else
2019  {0x1011121314151617UL, 0UL};
2020 #endif
2021  __m64_union __t;
2022  __v2du __a, __r;
2023 
2024  __t.as_short[0] = __permute_selectors[__element_selector_98];
2025  __t.as_short[1] = __permute_selectors[__element_selector_BA];
2026  __t.as_short[2] = __permute_selectors[__element_selector_DC];
2027  __t.as_short[3] = __permute_selectors[__element_selector_FE];
2028  __pmask[1] = __t.as_m64;
2029  __a = (__v2du)__A;
2030  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2031  return (__m128i)__r;
2032 }
2033 
2034 extern __inline __m128i
2035  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2036  _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2037  unsigned long __element_selector_10 = __mask & 0x03;
2038  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2039  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2040  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2041  static const unsigned short __permute_selectors[4] = {
2042 #ifdef __LITTLE_ENDIAN__
2043  0x0100, 0x0302, 0x0504, 0x0706
2044 #else
2045  0x0001, 0x0203, 0x0405, 0x0607
2046 #endif
2047  };
2048  __v2du __pmask =
2049 #ifdef __LITTLE_ENDIAN__
2050  {0UL, 0x1f1e1d1c1b1a1918UL};
2051 #else
2052  {0UL, 0x18191a1b1c1d1e1fUL};
2053 #endif
2054  __m64_union __t;
2055  __v2du __a, __r;
2056  __t.as_short[0] = __permute_selectors[__element_selector_10];
2057  __t.as_short[1] = __permute_selectors[__element_selector_32];
2058  __t.as_short[2] = __permute_selectors[__element_selector_54];
2059  __t.as_short[3] = __permute_selectors[__element_selector_76];
2060  __pmask[0] = __t.as_m64;
2061  __a = (__v2du)__A;
2062  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2063  return (__m128i)__r;
2064 }
2065 
2066 extern __inline __m128i
2067  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2068  _mm_shuffle_epi32(__m128i __A, const int __mask) {
2069  unsigned long __element_selector_10 = __mask & 0x03;
2070  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2071  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2072  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2073  static const unsigned int __permute_selectors[4] = {
2074 #ifdef __LITTLE_ENDIAN__
2075  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2076 #else
2077  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2078 #endif
2079  };
2080  __v4su __t;
2081 
2082  __t[0] = __permute_selectors[__element_selector_10];
2083  __t[1] = __permute_selectors[__element_selector_32];
2084  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2085  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2086  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2087  (__vector unsigned char)__t);
2088 }
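
/* Illustrative sketch (editor's example, not part of the original
   header): with mask 0x1B (equivalent to _MM_SHUFFLE(0, 1, 2, 3) from
   xmmintrin.h) the four 32-bit lanes are reversed.  */
#if 0
static __inline __m128i __example_reverse_words(__m128i __v) {
  return _mm_shuffle_epi32(__v, 0x1B);
}
#endif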
2089 
2090 extern __inline void
2091  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2092  _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2093  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2094  __v16qu __mask, __tmp;
2095  __m128i_u *__p = (__m128i_u *)__C;
2096 
2097  __tmp = (__v16qu)_mm_loadu_si128(__p);
2098  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2099  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2100  _mm_storeu_si128(__p, (__m128i)__tmp);
2101 }
2102 
2103 extern __inline __m128i
2104  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2105  _mm_avg_epu8(__m128i __A, __m128i __B) {
2106  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2107 }
2108 
2109 extern __inline __m128i
2110  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2111  _mm_avg_epu16(__m128i __A, __m128i __B) {
2112  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2113 }
2114 
2115 extern __inline __m128i
2116  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2117  _mm_sad_epu8(__m128i __A, __m128i __B) {
2118  __v16qu __a, __b;
2119  __v16qu __vabsdiff;
2120  __v4si __vsum;
2121  const __v4su __zero = {0, 0, 0, 0};
2122  __v4si __result;
2123 
2124  __a = (__v16qu)__A;
2125  __b = (__v16qu)__B;
2126 #ifndef _ARCH_PWR9
2127  __v16qu __vmin = vec_min(__a, __b);
2128  __v16qu __vmax = vec_max(__a, __b);
2129  __vabsdiff = vec_sub(__vmax, __vmin);
2130 #else
2131  __vabsdiff = vec_absd(__a, __b);
2132 #endif
2133  /* Sum four groups of bytes into integers. */
2134  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2135 #ifdef __LITTLE_ENDIAN__
2136  /* Sum across four integers with two integer results. */
2137  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2138  /* Note: vec_sum2s could be used here, but on little-endian, vector
2139  shifts are added that are not needed for this use-case.
2140  A vector shift to correctly position the 32-bit integer results
2141  (currently at [0] and [2]) to [1] and [3] would then need to be
2142  swapped back again since the desired results are two 64-bit
2143  integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
2144 #else
2145  /* Sum across four integers with two integer results. */
2146  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2147  /* Rotate the sums into the correct position. */
2148  __result = vec_sld(__result, __result, 6);
2149 #endif
2150  return (__m128i)__result;
2151 }
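
/* Illustrative sketch (editor's example, not part of the original
   header): each 64-bit half of the result holds the sum of eight byte
   absolute differences in its low 16 bits.  */
#if 0
static __inline long long __example_sad(void) {
  __m128i __r = _mm_sad_epu8(_mm_set1_epi8(10), _mm_set1_epi8(7));
  return ((__v2di)__r)[0]; /* |10 - 7| * 8 == 24 in each half */
}
#endif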
2152 
2153 extern __inline void
2154  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2155  _mm_stream_si32(int *__A, int __B) {
2156  /* Use the data cache block touch for store transient. */
2157  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2158  *__A = __B;
2159 }
2160 
2161 extern __inline void
2162  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2163  _mm_stream_si64(long long int *__A, long long int __B) {
2164  /* Use the data cache block touch for store transient. */
2165  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2166  *__A = __B;
2167 }
2168 
2169 extern __inline void
2170  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2171  _mm_stream_si128(__m128i *__A, __m128i __B) {
2172  /* Use the data cache block touch for store transient. */
2173  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2174  *__A = __B;
2175 }
2176 
2177 extern __inline void
2178  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2179  _mm_stream_pd(double *__A, __m128d __B) {
2180  /* Use the data cache block touch for store transient. */
2181  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2182  *(__m128d *)__A = __B;
2183 }
2184 
2185 extern __inline void
2186  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2187  _mm_clflush(void const *__A) {
2188  /* Use the data cache block flush. */
2189  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2190 }
2191 
2192 extern __inline void
2193  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2194  _mm_lfence(void) {
2195  /* Use light weight sync (lwsync) for load to load ordering. */
2196  __atomic_thread_fence(__ATOMIC_RELEASE);
2197 }
2198 
2199 extern __inline void
2200  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2201  _mm_mfence(void) {
2202  /* Use heavy weight sync (hwsync) for any to any ordering. */
2203  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2204 }
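
/* Illustrative sketch (editor's example, not part of the original
   header): a typical publish pattern where the fence keeps the data
   store ahead of the flag store, as callers of _mm_mfence expect on
   x86.  The names __example_data/__example_flag are hypothetical.  */
#if 0
static int __example_data, __example_flag;
static __inline void __example_publish(int __v) {
  __example_data = __v;
  _mm_mfence(); /* orders the store above before the store below */
  __example_flag = 1;
}
#endif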
2205 
2206 extern __inline __m128i
2207  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2208  _mm_cvtsi32_si128(int __A) {
2209  return _mm_set_epi32(0, 0, 0, __A);
2210 }
2211 
2212 extern __inline __m128i
2213  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2214  _mm_cvtsi64_si128(long long __A) {
2215  return __extension__(__m128i)(__v2di){__A, 0LL};
2216 }
2217 
2218 /* Microsoft intrinsic. */
2219 extern __inline __m128i
2220  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2221  _mm_cvtsi64x_si128(long long __A) {
2222  return __extension__(__m128i)(__v2di){__A, 0LL};
2223 }
2224 
2225 /* Casts between various SP, DP, INT vector types. Note that these perform
2226  no conversion of values; they just change the type. */
2227 extern __inline __m128
2228  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2229  _mm_castpd_ps(__m128d __A) {
2230  return (__m128)__A;
2231 }
2232 
2233 extern __inline __m128i
2234  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2235  _mm_castpd_si128(__m128d __A) {
2236  return (__m128i)__A;
2237 }
2238 
2239 extern __inline __m128d
2240  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2241  _mm_castps_pd(__m128 __A) {
2242  return (__m128d)__A;
2243 }
2244 
2245 extern __inline __m128i
2246  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2247  _mm_castps_si128(__m128 __A) {
2248  return (__m128i)__A;
2249 }
2250 
2251 extern __inline __m128
2252  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2253  _mm_castsi128_ps(__m128i __A) {
2254  return (__m128)__A;
2255 }
2256 
2257 extern __inline __m128d
2258  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2259  _mm_castsi128_pd(__m128i __A) {
2260  return (__m128d)__A;
2261 }
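
/* Illustrative sketch (editor's example, not part of the original
   header): the casts reinterpret bits only; to convert integer values
   to floating point use _mm_cvtepi32_ps, defined earlier in this
   header.  */
#if 0
static __inline __m128 __example_cast_vs_convert(__m128i __v) {
  __m128 __bits = _mm_castsi128_ps(__v); /* same 128 bits, new type */
  (void)_mm_cvtepi32_ps(__v);            /* numeric int -> float conversion */
  return __bits;
}
#endif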
2262 
2263 #else
2264 #include_next <emmintrin.h>
2265 #endif /* defined(__ppc64__) &&
2266  * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2267 
2268 #endif /* EMMINTRIN_H_ */
_mm_xor_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:397
_mm_set_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1777
vec_cmple
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition: altivec.h:2369
_mm_load1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1523
_mm_cvtepi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1287
_mm_cvtpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1444
_mm_undefined_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1709
_mm_set_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3464
_mm_mul_epu32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2399
_mm_srli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2977
_mm_cmpeq_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3013
vec_st
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, long __b, vector signed char *__c)
Definition: altivec.h:11172
_mm_cmpnle_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:884
_mm_add_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2027
_mm_loadl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3379
_mm_madd_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2230
_mm_unpacklo_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4592
vec_unpackl
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
Definition: altivec.h:12769
_mm_slli_si128
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2675
_mm_insert_epi16
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter,...
Definition: emmintrin.h:4178
_mm_set_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3416
_mm_add_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2048
_mm_loadu_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3359
_mm_sqrt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:226
_mm_cmpgt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:733
_mm_cmpgt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3110
vec_splat_s16
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
Definition: altivec.h:10325
_mm_subs_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2584
_mm_unpackhi_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4572
_mm_comige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1057
_mm_sub_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:104
_mm_srai_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2843
_mm_srli_si128
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2884
_mm_ucomigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1180
vec_and
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition: altivec.h:882
vec_packsu
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
Definition: altivec.h:7832
_mm_cmpunord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:812
_mm_shuffle_epi32
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector par...
Definition: emmintrin.h:4229
_mm_div_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:183
_mm_and_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2601
_mm_add_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2086
_mm_set_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3553
_mm_cvtsd_ss
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1343
vec_sro
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
Definition: altivec.h:10967
_mm_unpacklo_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4485
_mm_cmpeq_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3031
int
__device__ int
Definition: __clang_hip_libdevice_declares.h:63
_mm_set_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1727
_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm)
Definition: emmintrin.h:2888
_mm_max_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:326
_mm_subs_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2526
_mm_setr_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3661
_mm_sll_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2786
__a
static __inline__ void int __a
Definition: emmintrin.h:3976
_mm_ucomineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1230
_mm_ucomile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1155
_mm_castpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4656
_mm_mul_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:161
vec_adds
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:626
_mm_sub_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2506
_mm_set1_epi64x
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3574
_mm_storel_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1982
vec_nor
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition: altivec.h:6717
vec_ld
static __inline__ vector signed char __ATTRS_o_ai vec_ld(long __a, const vector signed char *__b)
Definition: altivec.h:4049
_mm_sra_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2862
_mm_storeu_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1924
_mm_unpackhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4355
_mm_load_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1638
_mm_undefined_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3396
vec_mergel
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:5349
_mm_store_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1868
__P
__inline unsigned int unsigned int unsigned int * __P
Definition: bmi2intrin.h:25
_mm_packus_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4119
_mm_srl_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2995
_mm_setr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1797
_mm_cmpneq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:836
_mm_comineq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1082
__Y
__inline unsigned int unsigned int __Y
Definition: bmi2intrin.h:19
_mm_bslli_si128
#define _mm_bslli_si128(a, imm)
Definition: emmintrin.h:2679
_mm_cvtsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1320
vec_sra
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:10515
vec_sl
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8870
_mm_cmpgt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:474
_mm_max_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2268
vec_vmrghw
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition: altivec.h:5314
_mm_cvttps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3269
_mm_maskmoveu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3897
vec_splat
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition: altivec.h:10078
_mm_cmpeq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:415
_mm_cmple_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:454
vec_xor
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:13195
_mm_loadh_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1663
vec_sld
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
Definition: altivec.h:9137
_mm_cmpge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:758
vec_cts
#define vec_cts
Definition: altivec.h:3311
_mm_sqrt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:243
_mm_cvtps_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3254
_mm_movepi64_pi64
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4522
vec_or
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition: altivec.h:6853
vec_cmplt
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2435
_mm_min_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2287
_mm_xor_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2653
_mm_andnot_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:363
_mm_set1_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3591
_mm_cmpnge_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:933
_mm_unpackhi_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4399
_mm_cvtps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1266
_mm_cvtsi32_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1365
_mm_add_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:64
_mm_setr_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3762
_mm_cmpeq_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3049
_mm_cmplt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:685
_mm_unpacklo_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4433
_mm_set1_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3642
vec_slo
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
Definition: altivec.h:9872
_mm_cmple_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:709
_mm_cvttpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1413
vec_cmpge
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition: altivec.h:2243
_mm_mulhi_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2325
_mm_comile_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1007
_mm_loadu_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1563
_mm_sra_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2824
_mm_cvtpi32_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1477
_mm_clflush
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
_mm_cmplt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3130
_mm_move_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4553
_mm_mulhi_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2344
_mm_shufflelo_epi16
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4262
_mm_cmpnle_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:599
altivec.h
_mm_avg_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2205
_mm_load_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3344
_mm_mullo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2363
_mm_cvtsi128_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3313
__D
static __inline__ void short __D
Definition: immintrin.h:382
_mm_cmpeq_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:661
_mm_shufflehi_epi16
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four upper 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4295
vec_cmpeq
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1708
_mm_move_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1830
_mm_sub_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2471
_mm_srl_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2923
_mm_cmpnlt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:860
vec_splats
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:14706
_mm_cvtsi64_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3298
_mm_unpackhi_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4378
_mm_sub_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2454
vec_subs
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:12137
_mm_adds_epu16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2167
_mm_slli_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2768
_mm_sll_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2750
_mm_set_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3504
__attribute__
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
_mm_castsi128_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4716
_mm_storeh_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1963
_mm_srli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2941
vec_andc
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition: altivec.h:1235
vec_perm
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7950
_mm_unpacklo_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4462
_mm_packs_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4092
_mm_cvtepi32_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3240
_mm_comieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:957
_mm_slli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2696
_mm_ucomieq_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1105
vec_sub
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:11857
_mm_store_pd1
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1907
_mm_setzero_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3777
_mm_castps_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4701
_mm_adds_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2147
_mm_div_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:202
_mm_or_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:380
_mm_andnot_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2620
_mm_unpacklo_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4506
vec_max
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition: altivec.h:4826
_mm_cmpunord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:539
_mm_cvtpd_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1305
_mm_subs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2546
_mm_storeu_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3808
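A sketch of the unaligned store path (names illustrative); the __m128i_u parameter type is what relaxes the usual 16-byte alignment requirement.

  #include <emmintrin.h>

  /* Write 16 bytes to dst with no alignment assumption. */
  static void store_block(unsigned char *dst, __m128i v) {
    _mm_storeu_si128((__m128i_u *)dst, v);
  }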
_mm_mul_su32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2381
vec_ctf
#define vec_ctf(__a, __b)
Definition: altivec.h:3241
vec_avg
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition: altivec.h:1586
_mm_stream_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:3938
_mm_set1_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3625
_mm_loadl_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1689
_mm_slli_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2732
_mm_cmpge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:494
_mm_sll_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2714
_mm_shuffle_pd
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
Definition: emmintrin.h:4641
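The immediate is conventionally built with _MM_SHUFFLE2: bit 0 selects the low result element from the first operand, bit 1 the high element from the second. A sketch (name illustrative):

  #include <emmintrin.h>

  /* Swap the two lanes of v: result[0] = v[1], result[1] = v[0]. */
  static __m128d swap_lanes(__m128d v) {
    return _mm_shuffle_pd(v, v, _MM_SHUFFLE2(0, 1));
  }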
_mm_cvttpd_pi32
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1462
vec_splat_s32
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
Definition: altivec.h:10341
_mm_min_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:284
_mm_packs_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers,...
Definition: emmintrin.h:4065
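A narrowing sketch (name illustrative): the first operand supplies the low eight bytes of the result and the second the high eight, with values outside [-128, 127] clamped to the range limits.

  #include <emmintrin.h>

  /* Narrow two [8 x i16] vectors into one [16 x i8] with signed saturation. */
  static __m128i narrow_i16_to_i8(__m128i lo, __m128i hi) {
    return _mm_packs_epi16(lo, hi);
  }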
_mm_sad_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2420
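_mm_sad_epu8 leaves two partial sums, one per 64-bit half, so a scalar total needs one more fold; a sketch assuming _mm_srli_si128 and _mm_cvtsi128_si32 from this header (name illustrative):

  #include <emmintrin.h>

  /* Total sum of absolute byte differences across all 16 lanes. */
  static unsigned int sad16(__m128i a, __m128i b) {
    __m128i s = _mm_sad_epu8(a, b);
    return (unsigned int)(_mm_cvtsi128_si32(s) +
                          _mm_cvtsi128_si32(_mm_srli_si128(s, 8)));
  }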
_mm_comigt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1032
vec_unpackh
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
Definition: altivec.h:12630
_mm_comilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:982
vec_mule
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
Definition: altivec.h:6251
_mm_storer_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:1946
_mm_setr_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3715
_mm_cvtss_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1390
_mm_load_pd1
#define _mm_load_pd1(dp)
Definition: emmintrin.h:1531
_mm_srai_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2805
_mm_cmplt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:434
_mm_castps_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4686
_mm_movemask_pd
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4610
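The result is a 2-bit mask: bit 0 comes from the low element's sign, bit 1 from the high element's. Sketch (name illustrative):

  #include <emmintrin.h>

  /* True when both doubles in v have their sign bit set. */
  static int both_negative(__m128d v) {
    return _mm_movemask_pd(v) == 0x3;
  }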
_mm_min_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:265
_mm_sub_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:122
_mm_cmpneq_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:559
_mm_movpi64_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4537
_mm_cmpord_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:516
_mm_movemask_epi8
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4194
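Combined with a byte compare, this gives the classic 16-bytes-at-a-time scan; a sketch assuming a GCC/clang builtin for the bit scan (names illustrative):

  #include <emmintrin.h>

  /* Index of the first zero byte in a 16-byte block, or -1 if none. */
  static int first_zero_byte(__m128i block) {
    int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_setzero_si128()));
    return mask ? __builtin_ctz(mask) : -1;
  }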
vec_min
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition: altivec.h:5730
_mm_set1_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3608
_mm_set1_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1743
_mm_store1_pd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1887
_mm_add_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:82
_mm_cvtsi128_si64
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3329
_mm_avg_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2186
_mm_or_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2636
_mm_adds_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2106
vec_cmpgt
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2131
_mm_mfence
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
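A publish/consume sketch (illustrative; on this PowerPC port the fence presumably lowers to a full hardware barrier rather than an x86 mfence, and new code would normally use C11 atomics instead):

  #include <emmintrin.h>

  /* Keep the data store ordered before the flag store. */
  static void publish(int *data, volatile int *flag) {
    *data = 42;
    _mm_mfence();
    *flag = 1;
  }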
_mm_cmpnlt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:579
_mm_cmplt_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3170
_mm_store_sd
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1847
_mm_lfence
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
_mm_add_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2006
_mm_loadr_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1547
_mm_cmpgt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3090
vec_sum4s
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition: altivec.h:12475
_mm_sub_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2437
_mm_cmpgt_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3068
_mm_unpackhi_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4327
_mm_max_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:307
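Because the *_sd forms pass the first operand's high lane through unchanged, min and max compose into a low-lane clamp (sketch; name illustrative):

  #include <emmintrin.h>

  /* Clamp the low lane of x to [lo, hi]; the high lane of x is preserved. */
  static __m128d clamp_low(__m128d x, __m128d lo, __m128d hi) {
    return _mm_min_sd(_mm_max_sd(x, lo), hi);
  }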
_mm_cvtpd_ps
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1248
_mm_ucomilt_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1130
_mm_store_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3793
_mm_cvtsd_f64
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1492
_mm_castpd_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4671
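The cast family reinterprets the 128-bit pattern without converting values; a sketch routing a double-precision payload through the integer store path (names illustrative):

  #include <emmintrin.h>

  /* Store the raw bits of v via the integer path; no value conversion. */
  static void store_bits(__m128i *dst, __m128d v) {
    _mm_store_si128(dst, _mm_castpd_si128(v));
  }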
_mm_stream_si128
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:3956
_mm_min_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2306
_mm_cmpnge_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:639
_mm_subs_epu8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
Definition: emmintrin.h:2565
_mm_and_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:343
_mm_mul_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:143
_mm_adds_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2127
_mm_cmplt_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3150
_mm_storel_epi64
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3916
_mm_set_epi64
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3437
_mm_ucomige_sd
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1205
_mm_cvttsd_si32
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
Definition: emmintrin.h:1429
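The extra "t" means truncation: rounding is toward zero regardless of the current rounding mode, matching a C cast. Sketch (name illustrative):

  #include <emmintrin.h>

  /* 2.9 -> 2 and -2.9 -> -2, independent of the FP rounding mode. */
  static int trunc_low(__m128d v) {
    return _mm_cvttsd_si32(v);
  }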
_mm_cvtsi32_si128
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3283
vec_sel
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:8576
_mm_cmpngt_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:619
_mm_set_pd1
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1759
_mm_castsi128_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4731
_mm_setzero_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1811
_mm_srli_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2905
_mm_srl_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of the 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2959
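Unlike the _mm_srli_* forms, the _mm_srl_* forms read the count from the low 64 bits of a vector, allowing a runtime shift amount; pairing with _mm_cvtsi32_si128 is the usual idiom (sketch; name illustrative):

  #include <emmintrin.h>

  /* Right-shift each 32-bit lane by a count known only at runtime. */
  static __m128i shr_var(__m128i v, int n) {
    return _mm_srl_epi32(v, _mm_cvtsi32_si128(n));
  }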
_mm_extract_epi16
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
Definition: emmintrin.h:4150
vec_packs
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7703
_mm_max_epi16
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2249
_mm_setr_epi32
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3683
vec_mergeh
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition: altivec.h:5079
_mm_cmpngt_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:908
_mm_cmpord_sd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:785
vec_sr
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:10381
_mm_load_pd
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1507
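The aligned-load contract matters: the pointer must be 16-byte aligned, otherwise behavior is undefined and _mm_loadu_pd is the right call. Sketch (name illustrative):

  #include <emmintrin.h>

  /* p must be 16-byte aligned; use _mm_loadu_pd when it may not be. */
  static __m128d load_pair(double const *p) {
    return _mm_load_pd(p);
  }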