clang  10.0.0svn
emmintrin.h
Go to the documentation of this file.
1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15  explicitly from x86_64 to powerpc64/powerpc64le.
16 
17  Since X86 SSE2 intrinsics mainly handles __m128i and __m128d type,
18  PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
19  However scalar float operations in vector (XMM) registers require
20  the POWER8 VSX ISA (2.07) level. There are differences for data
21  format and placement of float scalars in the vector register, which
22  require extra steps to match SSE2 scalar float semantics on POWER.
23 
24  It should be noted that there's much difference between X86_64's
25  MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use
26  portable <fenv.h> instead of access MXSCR directly.
27 
28  Most SSE2 scalar float intrinsic operations can be performed more
29  efficiently as C language float scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications.
31 */
32 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
34 
35 #ifndef EMMINTRIN_H_
36 #define EMMINTRIN_H_
37 
38 #include <altivec.h>
39 
40 /* We need definitions from the SSE header files. */
41 #include <xmmintrin.h>
42 
43 /* SSE2 */
44 typedef __vector double __v2df;
45 typedef __vector long long __v2di;
46 typedef __vector unsigned long long __v2du;
47 typedef __vector int __v4si;
48 typedef __vector unsigned int __v4su;
49 typedef __vector short __v8hi;
50 typedef __vector unsigned short __v8hu;
51 typedef __vector signed char __v16qi;
52 typedef __vector unsigned char __v16qu;
53 
54 /* The Intel API is flexible enough that we must allow aliasing with other
55  vector types, and their scalar components. */
56 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
57 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
58 
59 /* Unaligned version of the same types. */
60 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
61 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
62 
63 /* Define two value permute mask. */
64 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
65 
66 /* Create a vector with element 0 as F and the rest zero. */
67 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
68 _mm_set_sd (double __F)
69 {
70  return __extension__ (__m128d){ __F, 0.0 };
71 }
72 
73 /* Create a vector with both elements equal to F. */
74 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75 _mm_set1_pd (double __F)
76 {
77  return __extension__ (__m128d){ __F, __F };
78 }
79 
80 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
81 _mm_set_pd1 (double __F)
82 {
83  return _mm_set1_pd (__F);
84 }
85 
86 /* Create a vector with the lower value X and upper value W. */
87 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88 _mm_set_pd (double __W, double __X)
89 {
90  return __extension__ (__m128d){ __X, __W };
91 }
92 
93 /* Create a vector with the lower value W and upper value X. */
94 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95 _mm_setr_pd (double __W, double __X)
96 {
97  return __extension__ (__m128d){ __W, __X };
98 }
99 
100 /* Create an undefined vector. */
101 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102 _mm_undefined_pd (void)
103 {
104  __m128d __Y = __Y;
105  return __Y;
106 }
107 
108 /* Create a vector of zeros. */
109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_setzero_pd (void)
111 {
112  return (__m128d) vec_splats (0);
113 }
114 
115 /* Sets the low DPFP value of A from the low value of B. */
116 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117 _mm_move_sd (__m128d __A, __m128d __B)
118 {
119  __v2df result = (__v2df) __A;
120  result [0] = ((__v2df) __B)[0];
121  return (__m128d) result;
122 }
123 
124 /* Load two DPFP values from P. The address must be 16-byte aligned. */
125 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126 _mm_load_pd (double const *__P)
127 {
128  return ((__m128d)vec_ld(0, (__v16qu*)__P));
129 }
130 
131 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
132 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133 _mm_loadu_pd (double const *__P)
134 {
135  return (vec_vsx_ld(0, __P));
136 }
137 
138 /* Create a vector with all two elements equal to *P. */
139 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140 _mm_load1_pd (double const *__P)
141 {
142  return (vec_splats (*__P));
143 }
144 
145 /* Create a vector with element 0 as *P and the rest zero. */
146 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147 _mm_load_sd (double const *__P)
148 {
149  return _mm_set_sd (*__P);
150 }
151 
152 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm_load_pd1 (double const *__P)
154 {
155  return _mm_load1_pd (__P);
156 }
157 
158 /* Load two DPFP values in reverse order. The address must be aligned. */
159 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160 _mm_loadr_pd (double const *__P)
161 {
162  __v2df __tmp = _mm_load_pd (__P);
163  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
164 }
165 
166 /* Store two DPFP values. The address must be 16-byte aligned. */
167 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168 _mm_store_pd (double *__P, __m128d __A)
169 {
170  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
171 }
172 
173 /* Store two DPFP values. The address need not be 16-byte aligned. */
174 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175 _mm_storeu_pd (double *__P, __m128d __A)
176 {
177  *(__m128d_u *)__P = __A;
178 }
179 
180 /* Stores the lower DPFP value. */
181 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182 _mm_store_sd (double *__P, __m128d __A)
183 {
184  *__P = ((__v2df)__A)[0];
185 }
186 
187 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188 _mm_cvtsd_f64 (__m128d __A)
189 {
190  return ((__v2df)__A)[0];
191 }
192 
193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_storel_pd (double *__P, __m128d __A)
195 {
196  _mm_store_sd (__P, __A);
197 }
198 
199 /* Stores the upper DPFP value. */
200 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_storeh_pd (double *__P, __m128d __A)
202 {
203  *__P = ((__v2df)__A)[1];
204 }
205 /* Store the lower DPFP value across two words.
206  The address must be 16-byte aligned. */
207 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208 _mm_store1_pd (double *__P, __m128d __A)
209 {
210  _mm_store_pd (__P, vec_splat (__A, 0));
211 }
212 
213 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214 _mm_store_pd1 (double *__P, __m128d __A)
215 {
216  _mm_store1_pd (__P, __A);
217 }
218 
219 /* Store two DPFP values in reverse order. The address must be aligned. */
220 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_storer_pd (double *__P, __m128d __A)
222 {
223  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
224 }
225 
226 /* Intel intrinsic. */
227 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_cvtsi128_si64 (__m128i __A)
229 {
230  return ((__v2di)__A)[0];
231 }
232 
233 /* Microsoft intrinsic. */
234 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_cvtsi128_si64x (__m128i __A)
236 {
237  return ((__v2di)__A)[0];
238 }
239 
240 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 _mm_add_pd (__m128d __A, __m128d __B)
242 {
243  return (__m128d) ((__v2df)__A + (__v2df)__B);
244 }
245 
246 /* Add the lower double-precision (64-bit) floating-point element in
247  a and b, store the result in the lower element of dst, and copy
248  the upper element from a to the upper element of dst. */
249 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
250 _mm_add_sd (__m128d __A, __m128d __B)
251 {
252  __A[0] = __A[0] + __B[0];
253  return (__A);
254 }
255 
256 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _mm_sub_pd (__m128d __A, __m128d __B)
258 {
259  return (__m128d) ((__v2df)__A - (__v2df)__B);
260 }
261 
262 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263 _mm_sub_sd (__m128d __A, __m128d __B)
264 {
265  __A[0] = __A[0] - __B[0];
266  return (__A);
267 }
268 
269 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270 _mm_mul_pd (__m128d __A, __m128d __B)
271 {
272  return (__m128d) ((__v2df)__A * (__v2df)__B);
273 }
274 
275 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276 _mm_mul_sd (__m128d __A, __m128d __B)
277 {
278  __A[0] = __A[0] * __B[0];
279  return (__A);
280 }
281 
282 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283 _mm_div_pd (__m128d __A, __m128d __B)
284 {
285  return (__m128d) ((__v2df)__A / (__v2df)__B);
286 }
287 
288 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_div_sd (__m128d __A, __m128d __B)
290 {
291  __A[0] = __A[0] / __B[0];
292  return (__A);
293 }
294 
295 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296 _mm_sqrt_pd (__m128d __A)
297 {
298  return (vec_sqrt (__A));
299 }
300 
301 /* Return pair {sqrt (B[0]), A[1]}. */
302 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303 _mm_sqrt_sd (__m128d __A, __m128d __B)
304 {
305  __v2df c;
306  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
307  return (__m128d) _mm_setr_pd (c[0], __A[1]);
308 }
309 
310 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_min_pd (__m128d __A, __m128d __B)
312 {
313  return (vec_min (__A, __B));
314 }
315 
316 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
317 _mm_min_sd (__m128d __A, __m128d __B)
318 {
319  __v2df a, b, c;
320  a = vec_splats (__A[0]);
321  b = vec_splats (__B[0]);
322  c = vec_min (a, b);
323  return (__m128d) _mm_setr_pd (c[0], __A[1]);
324 }
325 
326 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327 _mm_max_pd (__m128d __A, __m128d __B)
328 {
329  return (vec_max (__A, __B));
330 }
331 
332 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333 _mm_max_sd (__m128d __A, __m128d __B)
334 {
335  __v2df a, b, c;
336  a = vec_splats (__A[0]);
337  b = vec_splats (__B[0]);
338  c = vec_max (a, b);
339  return (__m128d) _mm_setr_pd (c[0], __A[1]);
340 }
341 
342 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_cmpeq_pd (__m128d __A, __m128d __B)
344 {
345  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
346 }
347 
348 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_cmplt_pd (__m128d __A, __m128d __B)
350 {
351  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
352 }
353 
354 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_cmple_pd (__m128d __A, __m128d __B)
356 {
357  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
358 }
359 
360 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_cmpgt_pd (__m128d __A, __m128d __B)
362 {
363  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
364 }
365 
366 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_cmpge_pd (__m128d __A, __m128d __B)
368 {
369  return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
370 }
371 
372 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_cmpneq_pd (__m128d __A, __m128d __B)
374 {
375  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
376  return ((__m128d)vec_nor (temp, temp));
377 }
378 
379 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
381 {
382  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
383 }
384 
385 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 _mm_cmpnle_pd (__m128d __A, __m128d __B)
387 {
388  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
389 }
390 
391 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392 _mm_cmpngt_pd (__m128d __A, __m128d __B)
393 {
394  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
395 }
396 
397 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
398 _mm_cmpnge_pd (__m128d __A, __m128d __B)
399 {
400  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
401 }
402 
403 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_cmpord_pd (__m128d __A, __m128d __B)
405 {
406 #if _ARCH_PWR8
407  __v2du c, d;
408  /* Compare against self will return false (0's) if NAN. */
409  c = (__v2du)vec_cmpeq (__A, __A);
410  d = (__v2du)vec_cmpeq (__B, __B);
411 #else
412  __v2du a, b;
414  const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
415  a = (__v2du)vec_abs ((__v2df)__A);
416  b = (__v2du)vec_abs ((__v2df)__B);
417  c = (__v2du)vec_cmpgt (double_exp_mask, a);
418  d = (__v2du)vec_cmpgt (double_exp_mask, b);
419 #endif
420  /* A != NAN and B != NAN. */
421  return ((__m128d)vec_and(c, d));
422 }
423 
424 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
425 _mm_cmpunord_pd (__m128d __A, __m128d __B)
426 {
427 #if _ARCH_PWR8
428  __v2du c, d;
429  /* Compare against self will return false (0's) if NAN. */
430  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
431  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
432  /* A == NAN OR B == NAN converts too:
433  NOT(A != NAN) OR NOT(B != NAN). */
434  c = vec_nor (c, c);
435  return ((__m128d)vec_orc(c, d));
436 #else
437  __v2du c, d;
438  /* Compare against self will return false (0's) if NAN. */
439  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
440  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
441  /* Convert the true ('1's) is NAN. */
442  c = vec_nor (c, c);
443  d = vec_nor (d, d);
444  return ((__m128d)vec_or(c, d));
445 #endif
446 }
447 
448 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_cmpeq_sd(__m128d __A, __m128d __B)
450 {
451  __v2df a, b, c;
452  /* PowerISA VSX does not allow partial (for just lower double)
453  results. So to insure we don't generate spurious exceptions
454  (from the upper double values) we splat the lower double
455  before we do the operation. */
456  a = vec_splats (__A[0]);
457  b = vec_splats (__B[0]);
458  c = (__v2df) vec_cmpeq(a, b);
459  /* Then we merge the lower double result with the original upper
460  double from __A. */
461  return (__m128d) _mm_setr_pd (c[0], __A[1]);
462 }
463 
464 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_cmplt_sd (__m128d __A, __m128d __B)
466 {
467  __v2df a, b, c;
468  a = vec_splats (__A[0]);
469  b = vec_splats (__B[0]);
470  c = (__v2df) vec_cmplt(a, b);
471  return (__m128d) _mm_setr_pd (c[0], __A[1]);
472 }
473 
474 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
475 _mm_cmple_sd (__m128d __A, __m128d __B)
476 {
477  __v2df a, b, c;
478  a = vec_splats (__A[0]);
479  b = vec_splats (__B[0]);
480  c = (__v2df) vec_cmple(a, b);
481  return (__m128d) _mm_setr_pd (c[0], __A[1]);
482 }
483 
484 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485 _mm_cmpgt_sd (__m128d __A, __m128d __B)
486 {
487  __v2df a, b, c;
488  a = vec_splats (__A[0]);
489  b = vec_splats (__B[0]);
490  c = (__v2df) vec_cmpgt(a, b);
491  return (__m128d) _mm_setr_pd (c[0], __A[1]);
492 }
493 
494 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495 _mm_cmpge_sd (__m128d __A, __m128d __B)
496 {
497  __v2df a, b, c;
498  a = vec_splats (__A[0]);
499  b = vec_splats (__B[0]);
500  c = (__v2df) vec_cmpge(a, b);
501  return (__m128d) _mm_setr_pd (c[0], __A[1]);
502 }
503 
504 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505 _mm_cmpneq_sd (__m128d __A, __m128d __B)
506 {
507  __v2df a, b, c;
508  a = vec_splats (__A[0]);
509  b = vec_splats (__B[0]);
510  c = (__v2df) vec_cmpeq(a, b);
511  c = vec_nor (c, c);
512  return (__m128d) _mm_setr_pd (c[0], __A[1]);
513 }
514 
515 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
516 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
517 {
518  __v2df a, b, c;
519  a = vec_splats (__A[0]);
520  b = vec_splats (__B[0]);
521  /* Not less than is just greater than or equal. */
522  c = (__v2df) vec_cmpge(a, b);
523  return (__m128d) _mm_setr_pd (c[0], __A[1]);
524 }
525 
526 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 _mm_cmpnle_sd (__m128d __A, __m128d __B)
528 {
529  __v2df a, b, c;
530  a = vec_splats (__A[0]);
531  b = vec_splats (__B[0]);
532  /* Not less than or equal is just greater than. */
533  c = (__v2df) vec_cmpge(a, b);
534  return (__m128d) _mm_setr_pd (c[0], __A[1]);
535 }
536 
537 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpngt_sd (__m128d __A, __m128d __B)
539 {
540  __v2df a, b, c;
541  a = vec_splats (__A[0]);
542  b = vec_splats (__B[0]);
543  /* Not greater than is just less than or equal. */
544  c = (__v2df) vec_cmple(a, b);
545  return (__m128d) _mm_setr_pd (c[0], __A[1]);
546 }
547 
548 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _mm_cmpnge_sd (__m128d __A, __m128d __B)
550 {
551  __v2df a, b, c;
552  a = vec_splats (__A[0]);
553  b = vec_splats (__B[0]);
554  /* Not greater than or equal is just less than. */
555  c = (__v2df) vec_cmplt(a, b);
556  return (__m128d) _mm_setr_pd (c[0], __A[1]);
557 }
558 
559 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
560 _mm_cmpord_sd (__m128d __A, __m128d __B)
561 {
562  __v2df r;
563  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
564  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
565 }
566 
567 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
568 _mm_cmpunord_sd (__m128d __A, __m128d __B)
569 {
570  __v2df r;
571  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
572  return (__m128d) _mm_setr_pd (r[0], __A[1]);
573 }
574 
575 /* FIXME
576  The __mm_comi??_sd and __mm_ucomi??_sd implementations below are
577  exactly the same because GCC for PowerPC only generates unordered
578  compares (scalar and vector).
579  Technically __mm_comieq_sp et all should be using the ordered
580  compare and signal for QNaNs. The __mm_ucomieq_sd et all should
581  be OK. */
582 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583 _mm_comieq_sd (__m128d __A, __m128d __B)
584 {
585  return (__A[0] == __B[0]);
586 }
587 
588 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589 _mm_comilt_sd (__m128d __A, __m128d __B)
590 {
591  return (__A[0] < __B[0]);
592 }
593 
594 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595 _mm_comile_sd (__m128d __A, __m128d __B)
596 {
597  return (__A[0] <= __B[0]);
598 }
599 
600 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
601 _mm_comigt_sd (__m128d __A, __m128d __B)
602 {
603  return (__A[0] > __B[0]);
604 }
605 
606 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_comige_sd (__m128d __A, __m128d __B)
608 {
609  return (__A[0] >= __B[0]);
610 }
611 
612 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_comineq_sd (__m128d __A, __m128d __B)
614 {
615  return (__A[0] != __B[0]);
616 }
617 
618 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619 _mm_ucomieq_sd (__m128d __A, __m128d __B)
620 {
621  return (__A[0] == __B[0]);
622 }
623 
624 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_ucomilt_sd (__m128d __A, __m128d __B)
626 {
627  return (__A[0] < __B[0]);
628 }
629 
630 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_ucomile_sd (__m128d __A, __m128d __B)
632 {
633  return (__A[0] <= __B[0]);
634 }
635 
636 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637 _mm_ucomigt_sd (__m128d __A, __m128d __B)
638 {
639  return (__A[0] > __B[0]);
640 }
641 
642 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_ucomige_sd (__m128d __A, __m128d __B)
644 {
645  return (__A[0] >= __B[0]);
646 }
647 
648 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_ucomineq_sd (__m128d __A, __m128d __B)
650 {
651  return (__A[0] != __B[0]);
652 }
653 
654 /* Create a vector of Qi, where i is the element number. */
655 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656 _mm_set_epi64x (long long __q1, long long __q0)
657 {
658  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
659 }
660 
661 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
663 {
664  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
665 }
666 
667 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
669 {
670  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
671 }
672 
673 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
675  short __q3, short __q2, short __q1, short __q0)
676 {
677  return __extension__ (__m128i)(__v8hi){
678  __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
679 }
680 
681 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
683  char __q11, char __q10, char __q09, char __q08,
684  char __q07, char __q06, char __q05, char __q04,
685  char __q03, char __q02, char __q01, char __q00)
686 {
687  return __extension__ (__m128i)(__v16qi){
689  __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
690  };
691 }
692 
693 /* Set all of the elements of the vector to A. */
694 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_set1_epi64x (long long __A)
696 {
697  return _mm_set_epi64x (__A, __A);
698 }
699 
700 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_set1_epi64 (__m64 __A)
702 {
703  return _mm_set_epi64 (__A, __A);
704 }
705 
706 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
707 _mm_set1_epi32 (int __A)
708 {
709  return _mm_set_epi32 (__A, __A, __A, __A);
710 }
711 
712 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
713 _mm_set1_epi16 (short __A)
714 {
715  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
716 }
717 
718 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_set1_epi8 (char __A)
720 {
721  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
722  __A, __A, __A, __A, __A, __A, __A, __A);
723 }
724 
725 /* Create a vector of Qi, where i is the element number.
726  The parameter order is reversed from the _mm_set_epi* functions. */
727 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
729 {
730  return _mm_set_epi64 (__q1, __q0);
731 }
732 
733 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
735 {
736  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
737 }
738 
739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
741  short __q4, short __q5, short __q6, short __q7)
742 {
743  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
744 }
745 
746 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
747 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
748  char __q04, char __q05, char __q06, char __q07,
749  char __q08, char __q09, char __q10, char __q11,
750  char __q12, char __q13, char __q14, char __q15)
751 {
752  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
753  __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
754 }
755 
756 /* Create a vector with element 0 as *P and the rest zero. */
757 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_load_si128 (__m128i const *__P)
759 {
760  return *__P;
761 }
762 
763 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_loadu_si128 (__m128i_u const *__P)
765 {
766  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
767 }
768 
769 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770 _mm_loadl_epi64 (__m128i_u const *__P)
771 {
772  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
773 }
774 
775 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776 _mm_store_si128 (__m128i *__P, __m128i __B)
777 {
778  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
779 }
780 
781 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
783 {
784  *__P = __B;
785 }
786 
787 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
789 {
790  *(long long *)__P = ((__v2di)__B)[0];
791 }
792 
793 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_movepi64_pi64 (__m128i_u __B)
795 {
796  return (__m64) ((__v2di)__B)[0];
797 }
798 
799 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
801 {
802  return _mm_set_epi64 ((__m64)0LL, __A);
803 }
804 
805 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806 _mm_move_epi64 (__m128i __A)
807 {
808  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
809 }
810 
811 /* Create an undefined vector. */
812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_undefined_si128 (void)
814 {
815  __m128i __Y = __Y;
816  return __Y;
817 }
818 
819 /* Create a vector of zeros. */
820 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821 _mm_setzero_si128 (void)
822 {
823  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
824 }
825 
826 #ifdef _ARCH_PWR8
827 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828 _mm_cvtepi32_pd (__m128i __A)
829 {
830  __v2di val;
831  /* For LE need to generate Vector Unpack Low Signed Word.
832  Which is generated from unpackh. */
833  val = (__v2di)vec_unpackh ((__v4si)__A);
834 
835  return (__m128d)vec_ctf (val, 0);
836 }
837 #endif
838 
839 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
840 _mm_cvtepi32_ps (__m128i __A)
841 {
842  return ((__m128)vec_ctf((__v4si)__A, 0));
843 }
844 
845 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 _mm_cvtpd_epi32 (__m128d __A)
847 {
848  __v2df rounded = vec_rint (__A);
849  __v4si result, temp;
850  const __v4si vzero =
851  { 0, 0, 0, 0 };
852 
853  /* VSX Vector truncate Double-Precision to integer and Convert to
854  Signed Integer Word format with Saturate. */
855  __asm__(
856  "xvcvdpsxws %x0,%x1"
857  : "=wa" (temp)
858  : "wa" (rounded)
859  : );
860 
861 #ifdef _ARCH_PWR8
862  temp = vec_mergeo (temp, temp);
863  result = (__v4si) vec_vpkudum ((__vector long long) temp,
864  (__vector long long) vzero);
865 #else
866  {
867  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
868  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
869  result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
870  }
871 #endif
872  return (__m128i) result;
873 }
874 
875 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
876 _mm_cvtpd_pi32 (__m128d __A)
877 {
878  __m128i result = _mm_cvtpd_epi32(__A);
879 
880  return (__m64) result[0];
881 }
882 
883 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
884 _mm_cvtpd_ps (__m128d __A)
885 {
886  __v4sf result;
887  __v4si temp;
888  const __v4si vzero = { 0, 0, 0, 0 };
889 
890  __asm__(
891  "xvcvdpsp %x0,%x1"
892  : "=wa" (temp)
893  : "wa" (__A)
894  : );
895 
896 #ifdef _ARCH_PWR8
897  temp = vec_mergeo (temp, temp);
898  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
899  (__vector long long) vzero);
900 #else
901  {
902  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
903  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
904  result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
905  }
906 #endif
907  return ((__m128)result);
908 }
909 
910 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
911 _mm_cvttpd_epi32 (__m128d __A)
912 {
913  __v4si result;
914  __v4si temp;
915  const __v4si vzero = { 0, 0, 0, 0 };
916 
917  /* VSX Vector truncate Double-Precision to integer and Convert to
918  Signed Integer Word format with Saturate. */
919  __asm__(
920  "xvcvdpsxws %x0,%x1"
921  : "=wa" (temp)
922  : "wa" (__A)
923  : );
924 
925 #ifdef _ARCH_PWR8
926  temp = vec_mergeo (temp, temp);
927  result = (__v4si) vec_vpkudum ((__vector long long) temp,
928  (__vector long long) vzero);
929 #else
930  {
931  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
932  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
933  result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
934  }
935 #endif
936 
937  return ((__m128i) result);
938 }
939 
940 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941 _mm_cvttpd_pi32 (__m128d __A)
942 {
943  __m128i result = _mm_cvttpd_epi32 (__A);
944 
945  return (__m64) result[0];
946 }
947 
948 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
949 _mm_cvtsi128_si32 (__m128i __A)
950 {
951  return ((__v4si)__A)[0];
952 }
953 
954 #ifdef _ARCH_PWR8
955 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
956 _mm_cvtpi32_pd (__m64 __A)
957 {
958  __v4si temp;
959  __v2di tmp2;
960  __v2df result;
961 
962  temp = (__v4si)vec_splats (__A);
963  tmp2 = (__v2di)vec_unpackl (temp);
964  result = vec_ctf ((__vector signed long long) tmp2, 0);
965  return (__m128d)result;
966 }
967 #endif
968 
969 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
970 _mm_cvtps_epi32 (__m128 __A)
971 {
972  __v4sf rounded;
973  __v4si result;
974 
975  rounded = vec_rint((__v4sf) __A);
976  result = vec_cts (rounded, 0);
977  return (__m128i) result;
978 }
979 
980 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981 _mm_cvttps_epi32 (__m128 __A)
982 {
983  __v4si result;
984 
985  result = vec_cts ((__v4sf) __A, 0);
986  return (__m128i) result;
987 }
988 
989 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990 _mm_cvtps_pd (__m128 __A)
991 {
992  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
993 #ifdef vec_doubleh
994  return (__m128d) vec_doubleh ((__v4sf)__A);
995 #else
996  /* Otherwise the compiler is not current and so need to generate the
997  equivalent code. */
998  __v4sf a = (__v4sf)__A;
999  __v4sf temp;
1000  __v2df result;
1001 #ifdef __LITTLE_ENDIAN__
1002  /* The input float values are in elements {[0], [1]} but the convert
1003  instruction needs them in elements {[1], [3]}, So we use two
1004  shift left double vector word immediates to get the elements
1005  lined up. */
1006  temp = __builtin_vsx_xxsldwi (a, a, 3);
1007  temp = __builtin_vsx_xxsldwi (a, temp, 2);
1008 #else
1009  /* The input float values are in elements {[0], [1]} but the convert
1010  instruction needs them in elements {[0], [2]}, So we use two
1011  shift left double vector word immediates to get the elements
1012  lined up. */
1013  temp = vec_vmrghw (a, a);
1014 #endif
1015  __asm__(
1016  " xvcvspdp %x0,%x1"
1017  : "=wa" (result)
1018  : "wa" (temp)
1019  : );
1020  return (__m128d) result;
1021 #endif
1022 }
1023 
1024 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025 _mm_cvtsd_si32 (__m128d __A)
1026 {
1027  __v2df rounded = vec_rint((__v2df) __A);
1028  int result = ((__v2df)rounded)[0];
1029 
1030  return result;
1031 }
1032 /* Intel intrinsic. */
1033 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034 _mm_cvtsd_si64 (__m128d __A)
1035 {
1036  __v2df rounded = vec_rint ((__v2df) __A );
1037  long long result = ((__v2df) rounded)[0];
1038 
1039  return result;
1040 }
1041 
1042 /* Microsoft intrinsic. */
1043 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1044 _mm_cvtsd_si64x (__m128d __A)
1045 {
1046  return _mm_cvtsd_si64 ((__v2df)__A);
1047 }
1048 
1049 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050 _mm_cvttsd_si32 (__m128d __A)
1051 {
1052  int result = ((__v2df)__A)[0];
1053 
1054  return result;
1055 }
1056 
1057 /* Intel intrinsic. */
1058 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059 _mm_cvttsd_si64 (__m128d __A)
1060 {
1061  long long result = ((__v2df)__A)[0];
1062 
1063  return result;
1064 }
1065 
1066 /* Microsoft intrinsic. */
1067 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068 _mm_cvttsd_si64x (__m128d __A)
1069 {
1070  return _mm_cvttsd_si64 (__A);
1071 }
1072 
1073 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1075 {
1076  __v4sf result = (__v4sf)__A;
1077 
1078 #ifdef __LITTLE_ENDIAN__
1079  __v4sf temp_s;
1080  /* Copy double element[0] to element [1] for conversion. */
1081  __v2df temp_b = vec_splat((__v2df)__B, 0);
1082 
1083  /* Pre-rotate __A left 3 (logically right 1) elements. */
1084  result = __builtin_vsx_xxsldwi (result, result, 3);
1085  /* Convert double to single float scalar in a vector. */
1086  __asm__(
1087  "xscvdpsp %x0,%x1"
1088  : "=wa" (temp_s)
1089  : "wa" (temp_b)
1090  : );
1091  /* Shift the resulting scalar into vector element [0]. */
1092  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1093 #else
1094  result [0] = ((__v2df)__B)[0];
1095 #endif
1096  return (__m128) result;
1097 }
1098 
1099 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100 _mm_cvtsi32_sd (__m128d __A, int __B)
1101 {
1102  __v2df result = (__v2df)__A;
1103  double db = __B;
1104  result [0] = db;
1105  return (__m128d)result;
1106 }
1107 
1108 /* Intel intrinsic. */
1109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110 _mm_cvtsi64_sd (__m128d __A, long long __B)
1111 {
1112  __v2df result = (__v2df)__A;
1113  double db = __B;
1114  result [0] = db;
1115  return (__m128d)result;
1116 }
1117 
1118 /* Microsoft intrinsic. */
1119 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1121 {
1122  return _mm_cvtsi64_sd (__A, __B);
1123 }
1124 
1125 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm_cvtss_sd (__m128d __A, __m128 __B)
1127 {
1128 #ifdef __LITTLE_ENDIAN__
1129  /* Use splat to move element [0] into position for the convert. */
1130  __v4sf temp = vec_splat ((__v4sf)__B, 0);
1131  __v2df res;
1132  /* Convert single float scalar to double in a vector. */
1133  __asm__(
1134  "xscvspdp %x0,%x1"
1135  : "=wa" (res)
1136  : "wa" (temp)
1137  : );
1138  return (__m128d) vec_mergel (res, (__v2df)__A);
1139 #else
1140  __v2df res = (__v2df)__A;
1141  res [0] = ((__v4sf)__B) [0];
1142  return (__m128d) res;
1143 #endif
1144 }
1145 
1146 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1149  __vector double result;
1150  const int litmsk = __mask & 0x3;
1151 
1152  if (litmsk == 0)
1153  result = vec_mergeh (__A, __B);
1154 #if __GNUC__ < 6
1155  else if (litmsk == 1)
1156  result = vec_xxpermdi (__B, __A, 2);
1157  else if (litmsk == 2)
1158  result = vec_xxpermdi (__B, __A, 1);
1159 #else
1160  else if (litmsk == 1)
1161  result = vec_xxpermdi (__A, __B, 2);
1162  else if (litmsk == 2)
1163  result = vec_xxpermdi (__A, __B, 1);
1164 #endif
1165  else
1166  result = vec_mergel (__A, __B);
1167 
1168  return result;
1169 }
1170 
1171 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1172 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1173 {
1174  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1175 }
1176 
1177 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1178 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1179 {
1180  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1181 }
1182 
1183 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184 _mm_loadh_pd (__m128d __A, double const *__B)
1185 {
1186  __v2df result = (__v2df)__A;
1187  result [1] = *__B;
1188  return (__m128d)result;
1189 }
1190 
1191 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1192 _mm_loadl_pd (__m128d __A, double const *__B)
1193 {
1194  __v2df result = (__v2df)__A;
1195  result [0] = *__B;
1196  return (__m128d)result;
1197 }
1198 
1199 #ifdef _ARCH_PWR8
1200 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1201 
1202 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1203 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1204 _mm_movemask_pd (__m128d __A)
1205 {
1206  __vector unsigned long long result;
1207  static const __vector unsigned int perm_mask =
1208  {
1209 #ifdef __LITTLE_ENDIAN__
1210  0x80800040, 0x80808080, 0x80808080, 0x80808080
1211 #else
1212  0x80808080, 0x80808080, 0x80808080, 0x80804000
1213 #endif
1214  };
1215 
1216  result = ((__vector unsigned long long)
1217  vec_vbpermq ((__vector unsigned char) __A,
1218  (__vector unsigned char) perm_mask));
1219 
1220 #ifdef __LITTLE_ENDIAN__
1221  return result[1];
1222 #else
1223  return result[0];
1224 #endif
1225 }
1226 #endif /* _ARCH_PWR8 */
1227 
1228 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229 _mm_packs_epi16 (__m128i __A, __m128i __B)
1230 {
1231  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1232 }
1233 
1234 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235 _mm_packs_epi32 (__m128i __A, __m128i __B)
1236 {
1237  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1238 }
1239 
1240 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1241 _mm_packus_epi16 (__m128i __A, __m128i __B)
1242 {
1243  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1244 }
1245 
1246 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1247 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1248 {
1249  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1250 }
1251 
1252 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1254 {
1255  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1256 }
1257 
1258 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1260 {
1261  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1262 }
1263 
1264 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1266 {
1267  return (__m128i) vec_mergel ((__vector long long) __A,
1268  (__vector long long) __B);
1269 }
1270 
1271 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1273 {
1274  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1275 }
1276 
1277 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1279 {
1280  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1281 }
1282 
1283 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1284 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1285 {
1286  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1287 }
1288 
1289 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1291 {
1292  return (__m128i) vec_mergeh ((__vector long long) __A,
1293  (__vector long long) __B);
1294 }
1295 
1296 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _mm_add_epi8 (__m128i __A, __m128i __B)
1298 {
1299  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1300 }
1301 
1302 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303 _mm_add_epi16 (__m128i __A, __m128i __B)
1304 {
1305  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1306 }
1307 
1308 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309 _mm_add_epi32 (__m128i __A, __m128i __B)
1310 {
1311  return (__m128i) ((__v4su)__A + (__v4su)__B);
1312 }
1313 
1314 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315 _mm_add_epi64 (__m128i __A, __m128i __B)
1316 {
1317  return (__m128i) ((__v2du)__A + (__v2du)__B);
1318 }
1319 
1320 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321 _mm_adds_epi8 (__m128i __A, __m128i __B)
1322 {
1323  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1324 }
1325 
1326 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1327 _mm_adds_epi16 (__m128i __A, __m128i __B)
1328 {
1329  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1330 }
1331 
1332 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1333 _mm_adds_epu8 (__m128i __A, __m128i __B)
1334 {
1335  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1336 }
1337 
1338 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339 _mm_adds_epu16 (__m128i __A, __m128i __B)
1340 {
1341  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1342 }
1343 
1344 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1345 _mm_sub_epi8 (__m128i __A, __m128i __B)
1346 {
1347  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1348 }
1349 
1350 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351 _mm_sub_epi16 (__m128i __A, __m128i __B)
1352 {
1353  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1354 }
1355 
1356 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1357 _mm_sub_epi32 (__m128i __A, __m128i __B)
1358 {
1359  return (__m128i) ((__v4su)__A - (__v4su)__B);
1360 }
1361 
1362 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363 _mm_sub_epi64 (__m128i __A, __m128i __B)
1364 {
1365  return (__m128i) ((__v2du)__A - (__v2du)__B);
1366 }
1367 
1368 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm_subs_epi8 (__m128i __A, __m128i __B)
1370 {
1371  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1372 }
1373 
1374 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375 _mm_subs_epi16 (__m128i __A, __m128i __B)
1376 {
1377  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1378 }
1379 
1380 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381 _mm_subs_epu8 (__m128i __A, __m128i __B)
1382 {
1383  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1384 }
1385 
1386 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387 _mm_subs_epu16 (__m128i __A, __m128i __B)
1388 {
1389  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1390 }
1391 
1392 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1393 _mm_madd_epi16 (__m128i __A, __m128i __B)
1394 {
1395  __vector signed int zero = {0, 0, 0, 0};
1396 
1397  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1398 }
1399 
1400 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1401 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1402 {
1403  __vector signed int w0, w1;
1404 
1405  __vector unsigned char xform1 = {
1406 #ifdef __LITTLE_ENDIAN__
1407  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1408  0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1409 #else
1410  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1411  0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1412 #endif
1413  };
1414 
1415  w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1416  w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1417  return (__m128i) vec_perm (w0, w1, xform1);
1418 }
1419 
1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1422 {
1423  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1424 }
1425 
1426 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427 _mm_mul_su32 (__m64 __A, __m64 __B)
1428 {
1429  unsigned int a = __A;
1430  unsigned int b = __B;
1431 
1432  return ((__m64)a * (__m64)b);
1433 }
1434 
1435 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436 _mm_mul_epu32 (__m128i __A, __m128i __B)
1437 {
1438 #if __GNUC__ < 8
1439  __v2du result;
1440 
1441 #ifdef __LITTLE_ENDIAN__
1442  /* VMX Vector Multiply Odd Unsigned Word. */
1443  __asm__(
1444  "vmulouw %0,%1,%2"
1445  : "=v" (result)
1446  : "v" (__A), "v" (__B)
1447  : );
1448 #else
1449  /* VMX Vector Multiply Even Unsigned Word. */
1450  __asm__(
1451  "vmuleuw %0,%1,%2"
1452  : "=v" (result)
1453  : "v" (__A), "v" (__B)
1454  : );
1455 #endif
1456  return (__m128i) result;
1457 #else
1458  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1459 #endif
1460 }
1461 
1462 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1463 _mm_slli_epi16 (__m128i __A, int __B)
1464 {
1465  __v8hu lshift;
1466  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1467 
1468  if (__B >= 0 && __B < 16)
1469  {
1470  if (__builtin_constant_p(__B))
1471  lshift = (__v8hu) vec_splat_s16(__B);
1472  else
1473  lshift = vec_splats ((unsigned short) __B);
1474 
1475  result = vec_sl ((__v8hi) __A, lshift);
1476  }
1477 
1478  return (__m128i) result;
1479 }
1480 
1481 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1482 _mm_slli_epi32 (__m128i __A, int __B)
1483 {
1484  __v4su lshift;
1485  __v4si result = { 0, 0, 0, 0 };
1486 
1487  if (__B >= 0 && __B < 32)
1488  {
1489  if (__builtin_constant_p(__B) && __B < 16)
1490  lshift = (__v4su) vec_splat_s32(__B);
1491  else
1492  lshift = vec_splats ((unsigned int) __B);
1493 
1494  result = vec_sl ((__v4si) __A, lshift);
1495  }
1496 
1497  return (__m128i) result;
1498 }
1499 
1500 #ifdef _ARCH_PWR8
1501 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502 _mm_slli_epi64 (__m128i __A, int __B)
1503 {
1504  __v2du lshift;
1505  __v2di result = { 0, 0 };
1506 
1507  if (__B >= 0 && __B < 64)
1508  {
1509  if (__builtin_constant_p(__B) && __B < 16)
1510  lshift = (__v2du) vec_splat_s32(__B);
1511  else
1512  lshift = (__v2du) vec_splats ((unsigned int) __B);
1513 
1514  result = vec_sl ((__v2di) __A, lshift);
1515  }
1516 
1517  return (__m128i) result;
1518 }
1519 #endif
1520 
1521 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1522 _mm_srai_epi16 (__m128i __A, int __B)
1523 {
1524  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1525  __v8hi result;
1526 
1527  if (__B < 16)
1528  {
1529  if (__builtin_constant_p(__B))
1530  rshift = (__v8hu) vec_splat_s16(__B);
1531  else
1532  rshift = vec_splats ((unsigned short) __B);
1533  }
1534  result = vec_sra ((__v8hi) __A, rshift);
1535 
1536  return (__m128i) result;
1537 }
1538 
1539 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1540 _mm_srai_epi32 (__m128i __A, int __B)
1541 {
1542  __v4su rshift = { 31, 31, 31, 31 };
1543  __v4si result;
1544 
1545  if (__B < 32)
1546  {
1547  if (__builtin_constant_p(__B))
1548  {
1549  if (__B < 16)
1550  rshift = (__v4su) vec_splat_s32(__B);
1551  else
1552  rshift = (__v4su) vec_splats((unsigned int)__B);
1553  }
1554  else
1555  rshift = vec_splats ((unsigned int) __B);
1556  }
1557  result = vec_sra ((__v4si) __A, rshift);
1558 
1559  return (__m128i) result;
1560 }
1561 
1562 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1563 _mm_bslli_si128 (__m128i __A, const int __N)
1565  __v16qu result;
1566  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1567 
1568  if (__N < 16)
1569  result = vec_sld ((__v16qu) __A, zeros, __N);
1570  else
1571  result = zeros;
1572 
1573  return (__m128i) result;
1574 }
1575 
1576 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577 _mm_bsrli_si128 (__m128i __A, const int __N)
1578 {
1579  __v16qu result;
1580  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1581 
1582  if (__N < 16)
1583 #ifdef __LITTLE_ENDIAN__
1584  if (__builtin_constant_p(__N))
1585  /* Would like to use Vector Shift Left Double by Octet
1586  Immediate here to use the immediate form and avoid
1587  load of __N * 8 value into a separate VR. */
1588  result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1589  else
1590 #endif
1591  {
1592  __v16qu shift = vec_splats((unsigned char)(__N*8));
1593 #ifdef __LITTLE_ENDIAN__
1594  result = vec_sro ((__v16qu)__A, shift);
1595 #else
1596  result = vec_slo ((__v16qu)__A, shift);
1597 #endif
1598  }
1599  else
1600  result = zeros;
1601 
1602  return (__m128i) result;
1603 }
1604 
1605 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606 _mm_srli_si128 (__m128i __A, const int __N)
1607 {
1608  return _mm_bsrli_si128 (__A, __N);
1609 }
1610 
1611 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1612 _mm_slli_si128 (__m128i __A, const int _imm5)
1614  __v16qu result;
1615  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1616 
1617  if (_imm5 < 16)
1618 #ifdef __LITTLE_ENDIAN__
1619  result = vec_sld ((__v16qu) __A, zeros, _imm5);
1620 #else
1621  result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1622 #endif
1623  else
1624  result = zeros;
1625 
1626  return (__m128i) result;
1627 }
1628 
1629 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1630 
1631 _mm_srli_epi16 (__m128i __A, int __B)
1632 {
1633  __v8hu rshift;
1634  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1635 
1636  if (__B < 16)
1637  {
1638  if (__builtin_constant_p(__B))
1639  rshift = (__v8hu) vec_splat_s16(__B);
1640  else
1641  rshift = vec_splats ((unsigned short) __B);
1642 
1643  result = vec_sr ((__v8hi) __A, rshift);
1644  }
1645 
1646  return (__m128i) result;
1647 }
1648 
1649 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1650 _mm_srli_epi32 (__m128i __A, int __B)
1651 {
1652  __v4su rshift;
1653  __v4si result = { 0, 0, 0, 0 };
1654 
1655  if (__B < 32)
1656  {
1657  if (__builtin_constant_p(__B))
1658  {
1659  if (__B < 16)
1660  rshift = (__v4su) vec_splat_s32(__B);
1661  else
1662  rshift = (__v4su) vec_splats((unsigned int)__B);
1663  }
1664  else
1665  rshift = vec_splats ((unsigned int) __B);
1666 
1667  result = vec_sr ((__v4si) __A, rshift);
1668  }
1669 
1670  return (__m128i) result;
1671 }
1672 
1673 #ifdef _ARCH_PWR8
1674 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1675 _mm_srli_epi64 (__m128i __A, int __B)
1676 {
1677  __v2du rshift;
1678  __v2di result = { 0, 0 };
1679 
1680  if (__B < 64)
1681  {
1682  if (__builtin_constant_p(__B))
1683  {
1684  if (__B < 16)
1685  rshift = (__v2du) vec_splat_s32(__B);
1686  else
1687  rshift = (__v2du) vec_splats((unsigned long long)__B);
1688  }
1689  else
1690  rshift = (__v2du) vec_splats ((unsigned int) __B);
1691 
1692  result = vec_sr ((__v2di) __A, rshift);
1693  }
1694 
1695  return (__m128i) result;
1696 }
1697 #endif
1698 
1699 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1700 _mm_sll_epi16 (__m128i __A, __m128i __B)
1701 {
1702  __v8hu lshift;
1703  __vector __bool short shmask;
1704  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1705  __v8hu result;
1706 
1707 #ifdef __LITTLE_ENDIAN__
1708  lshift = vec_splat ((__v8hu) __B, 0);
1709 #else
1710  lshift = vec_splat ((__v8hu) __B, 3);
1711 #endif
1712  shmask = vec_cmple (lshift, shmax);
1713  result = vec_sl ((__v8hu) __A, lshift);
1714  result = vec_sel ((__v8hu) shmask, result, shmask);
1715 
1716  return (__m128i) result;
1717 }
1718 
1719 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720 _mm_sll_epi32 (__m128i __A, __m128i __B)
1721 {
1722  __v4su lshift;
1723  __vector __bool int shmask;
1724  const __v4su shmax = { 32, 32, 32, 32 };
1725  __v4su result;
1726 #ifdef __LITTLE_ENDIAN__
1727  lshift = vec_splat ((__v4su) __B, 0);
1728 #else
1729  lshift = vec_splat ((__v4su) __B, 1);
1730 #endif
1731  shmask = vec_cmplt (lshift, shmax);
1732  result = vec_sl ((__v4su) __A, lshift);
1733  result = vec_sel ((__v4su) shmask, result, shmask);
1734 
1735  return (__m128i) result;
1736 }
1737 
1738 #ifdef _ARCH_PWR8
1739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1740 _mm_sll_epi64 (__m128i __A, __m128i __B)
1741 {
1742  __v2du lshift;
1743  __vector __bool long long shmask;
1744  const __v2du shmax = { 64, 64 };
1745  __v2du result;
1746 
1747  lshift = vec_splat ((__v2du) __B, 0);
1748  shmask = vec_cmplt (lshift, shmax);
1749  result = vec_sl ((__v2du) __A, lshift);
1750  result = vec_sel ((__v2du) shmask, result, shmask);
1751 
1752  return (__m128i) result;
1753 }
1754 #endif
1755 
1756 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_sra_epi16 (__m128i __A, __m128i __B)
1758 {
1759  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1761  __v8hi result;
1762 
1763 #ifdef __LITTLE_ENDIAN__
1764  rshift = vec_splat ((__v8hu)__B, 0);
1765 #else
1766  rshift = vec_splat ((__v8hu)__B, 3);
1767 #endif
1768  rshift = vec_min (rshift, rshmax);
1769  result = vec_sra ((__v8hi) __A, rshift);
1770 
1771  return (__m128i) result;
1772 }
1773 
1774 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1775 _mm_sra_epi32 (__m128i __A, __m128i __B)
1776 {
1777  const __v4su rshmax = { 31, 31, 31, 31 };
1778  __v4su rshift;
1779  __v4si result;
1780 
1781 #ifdef __LITTLE_ENDIAN__
1782  rshift = vec_splat ((__v4su)__B, 0);
1783 #else
1784  rshift = vec_splat ((__v4su)__B, 1);
1785 #endif
1786  rshift = vec_min (rshift, rshmax);
1787  result = vec_sra ((__v4si) __A, rshift);
1788 
1789  return (__m128i) result;
1790 }
1791 
1792 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1793 _mm_srl_epi16 (__m128i __A, __m128i __B)
1794 {
1795  __v8hu rshift;
1796  __vector __bool short shmask;
1797  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1798  __v8hu result;
1799 
1800 #ifdef __LITTLE_ENDIAN__
1801  rshift = vec_splat ((__v8hu) __B, 0);
1802 #else
1803  rshift = vec_splat ((__v8hu) __B, 3);
1804 #endif
1805  shmask = vec_cmple (rshift, shmax);
1806  result = vec_sr ((__v8hu) __A, rshift);
1807  result = vec_sel ((__v8hu) shmask, result, shmask);
1808 
1809  return (__m128i) result;
1810 }
1811 
1812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813 _mm_srl_epi32 (__m128i __A, __m128i __B)
1814 {
1815  __v4su rshift;
1816  __vector __bool int shmask;
1817  const __v4su shmax = { 32, 32, 32, 32 };
1818  __v4su result;
1819 
1820 #ifdef __LITTLE_ENDIAN__
1821  rshift = vec_splat ((__v4su) __B, 0);
1822 #else
1823  rshift = vec_splat ((__v4su) __B, 1);
1824 #endif
1825  shmask = vec_cmplt (rshift, shmax);
1826  result = vec_sr ((__v4su) __A, rshift);
1827  result = vec_sel ((__v4su) shmask, result, shmask);
1828 
1829  return (__m128i) result;
1830 }
1831 
1832 #ifdef _ARCH_PWR8
1833 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1834 _mm_srl_epi64 (__m128i __A, __m128i __B)
1835 {
1836  __v2du rshift;
1837  __vector __bool long long shmask;
1838  const __v2du shmax = { 64, 64 };
1839  __v2du result;
1840 
1841  rshift = vec_splat ((__v2du) __B, 0);
1842  shmask = vec_cmplt (rshift, shmax);
1843  result = vec_sr ((__v2du) __A, rshift);
1844  result = vec_sel ((__v2du) shmask, result, shmask);
1845 
1846  return (__m128i) result;
1847 }
1848 #endif
1849 
1850 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851 _mm_and_pd (__m128d __A, __m128d __B)
1852 {
1853  return (vec_and ((__v2df) __A, (__v2df) __B));
1854 }
1855 
1856 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857 _mm_andnot_pd (__m128d __A, __m128d __B)
1858 {
1859  return (vec_andc ((__v2df) __B, (__v2df) __A));
1860 }
1861 
1862 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863 _mm_or_pd (__m128d __A, __m128d __B)
1864 {
1865  return (vec_or ((__v2df) __A, (__v2df) __B));
1866 }
1867 
1868 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869 _mm_xor_pd (__m128d __A, __m128d __B)
1870 {
1871  return (vec_xor ((__v2df) __A, (__v2df) __B));
1872 }
1873 
1874 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875 _mm_and_si128 (__m128i __A, __m128i __B)
1876 {
1877  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
1878 }
1879 
1880 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881 _mm_andnot_si128 (__m128i __A, __m128i __B)
1882 {
1883  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
1884 }
1885 
1886 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887 _mm_or_si128 (__m128i __A, __m128i __B)
1888 {
1889  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
1890 }
1891 
1892 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893 _mm_xor_si128 (__m128i __A, __m128i __B)
1894 {
1895  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
1896 }
1897 
1898 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899 _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1900 {
1901  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1902 }
1903 
1904 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905 _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1906 {
1907  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1908 }
1909 
1910 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911 _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1912 {
1913  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1914 }
1915 
1916 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917 _mm_cmplt_epi8 (__m128i __A, __m128i __B)
1918 {
1919  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1920 }
1921 
1922 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923 _mm_cmplt_epi16 (__m128i __A, __m128i __B)
1924 {
1925  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1926 }
1927 
1928 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1929 _mm_cmplt_epi32 (__m128i __A, __m128i __B)
1930 {
1931  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1932 }
1933 
1934 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1935 _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1936 {
1937  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1938 }
1939 
1940 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1941 _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1942 {
1943  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1944 }
1945 
1946 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1947 _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1948 {
1949  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1950 }
1951 
1952 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1953 _mm_extract_epi16 (__m128i const __A, int const __N)
1954 {
1955  return (unsigned short) ((__v8hi)__A)[__N & 7];
1956 }
1957 
1958 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1959 _mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1960 {
1961  __v8hi result = (__v8hi)__A;
1962 
1963  result [(__N & 7)] = __D;
1964 
1965  return (__m128i) result;
1966 }
1967 
1968 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1969 _mm_max_epi16 (__m128i __A, __m128i __B)
1970 {
1971  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1972 }
1973 
1974 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1975 _mm_max_epu8 (__m128i __A, __m128i __B)
1976 {
1977  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1978 }
1979 
1980 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1981 _mm_min_epi16 (__m128i __A, __m128i __B)
1982 {
1983  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1984 }
1985 
1986 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1987 _mm_min_epu8 (__m128i __A, __m128i __B)
1988 {
1989  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
1990 }
1991 
1992 
1993 #ifdef _ARCH_PWR8
1994 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1995 
1996 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1997 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1998 _mm_movemask_epi8 (__m128i __A)
1999 {
2000  __vector unsigned long long result;
2001  static const __vector unsigned char perm_mask =
2002  {
2003  0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2004  0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2005  };
2006 
2007  result = ((__vector unsigned long long)
2008  vec_vbpermq ((__vector unsigned char) __A,
2009  (__vector unsigned char) perm_mask));
2010 
2011 #ifdef __LITTLE_ENDIAN__
2012  return result[1];
2013 #else
2014  return result[0];
2015 #endif
2016 }
2017 #endif /* _ARCH_PWR8 */
2018 
2019 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2020 _mm_mulhi_epu16 (__m128i __A, __m128i __B)
2021 {
2022  __v4su w0, w1;
2023  __v16qu xform1 = {
2024 #ifdef __LITTLE_ENDIAN__
2025  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2026  0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2027 #else
2028  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2029  0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2030 #endif
2031  };
2032 
2033  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2034  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2035  return (__m128i) vec_perm (w0, w1, xform1);
2036 }
2037 
2038 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2039 _mm_shufflehi_epi16 (__m128i __A, const int __mask)
2040 {
2041  unsigned long element_selector_98 = __mask & 0x03;
2042  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2043  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2044  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2045  static const unsigned short permute_selectors[4] =
2046  {
2047 #ifdef __LITTLE_ENDIAN__
2048  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2049 #else
2050  0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2051 #endif
2052  };
2054 #ifdef __LITTLE_ENDIAN__
2055  { 0x1716151413121110UL, 0UL};
2056 #else
2057  { 0x1011121314151617UL, 0UL};
2058 #endif
2060  __v2du a, r;
2061 
2062  t.as_short[0] = permute_selectors[element_selector_98];
2063  t.as_short[1] = permute_selectors[element_selector_BA];
2064  t.as_short[2] = permute_selectors[element_selector_DC];
2065  t.as_short[3] = permute_selectors[element_selector_FE];
2066  pmask[1] = t.as_m64;
2067  a = (__v2du)__A;
2068  r = vec_perm (a, a, (__vector unsigned char)pmask);
2069  return (__m128i) r;
2070 }
2071 
2072 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2073 _mm_shufflelo_epi16 (__m128i __A, const int __mask)
2074 {
2075  unsigned long element_selector_10 = __mask & 0x03;
2076  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2077  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2078  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2079  static const unsigned short permute_selectors[4] =
2080  {
2081 #ifdef __LITTLE_ENDIAN__
2082  0x0100, 0x0302, 0x0504, 0x0706
2083 #else
2084  0x0001, 0x0203, 0x0405, 0x0607
2085 #endif
2086  };
2087  __v2du pmask =
2088 #ifdef __LITTLE_ENDIAN__
2089  { 0UL, 0x1f1e1d1c1b1a1918UL};
2090 #else
2091  { 0UL, 0x18191a1b1c1d1e1fUL};
2092 #endif
2093  __m64_union t;
2094  __v2du a, r;
2095  t.as_short[0] = permute_selectors[element_selector_10];
2096  t.as_short[1] = permute_selectors[element_selector_32];
2097  t.as_short[2] = permute_selectors[element_selector_54];
2098  t.as_short[3] = permute_selectors[element_selector_76];
2099  pmask[0] = t.as_m64;
2100  a = (__v2du)__A;
2101  r = vec_perm (a, a, (__vector unsigned char)pmask);
2102  return (__m128i) r;
2103 }
2104 
2105 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106 _mm_shuffle_epi32 (__m128i __A, const int __mask)
2107 {
2108  unsigned long element_selector_10 = __mask & 0x03;
2109  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2110  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2111  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2112  static const unsigned int permute_selectors[4] =
2113  {
2114 #ifdef __LITTLE_ENDIAN__
2115  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2116 #else
2117  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2118 #endif
2119  };
2120  __v4su t;
2121 
2122  t[0] = permute_selectors[element_selector_10];
2123  t[1] = permute_selectors[element_selector_32];
2124  t[2] = permute_selectors[element_selector_54] + 0x10101010;
2125  t[3] = permute_selectors[element_selector_76] + 0x10101010;
2126  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2127 }
2128 
2129 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2130 _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2132  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2134  __m128i_u *p = (__m128i_u*)__C;
2135 
2136  tmp = (__v16qu)_mm_loadu_si128(p);
2137  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2138  tmp = vec_sel (tmp, (__v16qu)__A, mask);
2139  _mm_storeu_si128 (p, (__m128i)tmp);
2140 }
2141 
2142 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2143 _mm_avg_epu8 (__m128i __A, __m128i __B)
2144 {
2145  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2146 }
2147 
2148 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2149 _mm_avg_epu16 (__m128i __A, __m128i __B)
2150 {
2151  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2152 }
2153 
2154 
2155 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156 _mm_sad_epu8 (__m128i __A, __m128i __B)
2157 {
2158  __v16qu a, b;
2161  const __v4su zero = { 0, 0, 0, 0 };
2162  __v4si result;
2163 
2164  a = (__v16qu) __A;
2165  b = (__v16qu) __B;
2166  vmin = vec_min (a, b);
2167  vmax = vec_max (a, b);
2168  vabsdiff = vec_sub (vmax, vmin);
2169  /* Sum four groups of bytes into integers. */
2170  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2171  /* Sum across four integers with two integer results. */
2172  result = vec_sum2s (vsum, (__vector signed int) zero);
2173  /* Rotate the sums into the correct position. */
2174 #ifdef __LITTLE_ENDIAN__
2175  result = vec_sld (result, result, 4);
2176 #else
2177  result = vec_sld (result, result, 6);
2178 #endif
2179  /* Rotate the sums into the correct position. */
2180  return (__m128i) result;
2181 }
2182 
2183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2184 _mm_stream_si32 (int *__A, int __B)
2185 {
2186  /* Use the data cache block touch for store transient. */
2187  __asm__ (
2188  "dcbtstt 0,%0"
2189  :
2190  : "b" (__A)
2191  : "memory"
2192  );
2193  *__A = __B;
2194 }
2195 
2196 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2197 _mm_stream_si64 (long long int *__A, long long int __B)
2198 {
2199  /* Use the data cache block touch for store transient. */
2200  __asm__ (
2201  " dcbtstt 0,%0"
2202  :
2203  : "b" (__A)
2204  : "memory"
2205  );
2206  *__A = __B;
2207 }
2208 
2209 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2210 _mm_stream_si128 (__m128i *__A, __m128i __B)
2211 {
2212  /* Use the data cache block touch for store transient. */
2213  __asm__ (
2214  "dcbtstt 0,%0"
2215  :
2216  : "b" (__A)
2217  : "memory"
2218  );
2219  *__A = __B;
2220 }
2221 
2222 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2223 _mm_stream_pd (double *__A, __m128d __B)
2224 {
2225  /* Use the data cache block touch for store transient. */
2226  __asm__ (
2227  "dcbtstt 0,%0"
2228  :
2229  : "b" (__A)
2230  : "memory"
2231  );
2232  *(__m128d*)__A = __B;
2233 }
2234 
2235 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236 _mm_clflush (void const *__A)
2237 {
2238  /* Use the data cache block flush. */
2239  __asm__ (
2240  "dcbf 0,%0"
2241  :
2242  : "b" (__A)
2243  : "memory"
2244  );
2245 }
2246 
2247 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248 _mm_lfence (void)
2249 {
2250  /* Use light weight sync for load to load ordering. */
2251  __atomic_thread_fence (__ATOMIC_RELEASE);
2252 }
2253 
2254 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2255 _mm_mfence (void)
2256 {
2257  /* Use heavy weight sync for any to any ordering. */
2258  __atomic_thread_fence (__ATOMIC_SEQ_CST);
2259 }
2260 
2261 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2262 _mm_cvtsi32_si128 (int __A)
2263 {
2264  return _mm_set_epi32 (0, 0, 0, __A);
2265 }
2266 
2267 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2268 _mm_cvtsi64_si128 (long long __A)
2269 {
2270  return __extension__ (__m128i)(__v2di){ __A, 0LL };
2271 }
2272 
2273 /* Microsoft intrinsic. */
2274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2275 _mm_cvtsi64x_si128 (long long __A)
2276 {
2277  return __extension__ (__m128i)(__v2di){ __A, 0LL };
2278 }
2279 
2280 /* Casts between various SP, DP, INT vector types. Note that these do no
2281  conversion of values, they just change the type. */
2282 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2283 _mm_castpd_ps(__m128d __A)
2284 {
2285  return (__m128) __A;
2286 }
2287 
2288 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2289 _mm_castpd_si128(__m128d __A)
2290 {
2291  return (__m128i) __A;
2292 }
2293 
2294 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2295 _mm_castps_pd(__m128 __A)
2296 {
2297  return (__m128d) __A;
2298 }
2299 
2300 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2301 _mm_castps_si128(__m128 __A)
2302 {
2303  return (__m128i) __A;
2304 }
2305 
2306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2307 _mm_castsi128_ps(__m128i __A)
2308 {
2309  return (__m128) __A;
2310 }
2311 
2312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2313 _mm_castsi128_pd(__m128i __A)
2314 {
2315  return (__m128d) __A;
2316 }
2317 
2318 #endif /* EMMINTRIN_H_ */
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:488
static const unsigned short permute_selectors[4]
Definition: emmintrin.h:2045
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2878
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition: altivec.h:11472
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3057
#define _mm_bslli_si128(a, imm)
Definition: emmintrin.h:2823
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1375
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7014
__inline __m128i int const __D
Definition: emmintrin.h:1959
__inline __m128i char char char char char char char char char char char __q04
Definition: emmintrin.h:682
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1250
__inline __m128i char __q14
Definition: emmintrin.h:682
static __inline__ vector signed char __ATTRS_o_ai vec_ld(int __a, const vector signed char *__b)
Definition: altivec.h:3446
unsigned long element_selector_DC
Definition: emmintrin.h:2043
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3699
const __v2du double_exp_mask
Definition: emmintrin.h:414
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
Definition: altivec.h:7143
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition: altivec.h:2084
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double]...
Definition: emmintrin.h:244
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3963
unsigned long element_selector_BA
Definition: emmintrin.h:2042
return vec_perm((__v4sf) __A,(__v4sf) __B,(__vector unsigned char) t)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2897
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2955
__v8hi zero
Definition: emmintrin.h:1397
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1198
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1473
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1338
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value...
Definition: emmintrin.h:3785
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3095
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3885
__vector signed char __v16qi
Definition: emmintrin.h:51
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4303
__m64_union
Definition: mmintrin.h:52
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value...
Definition: emmintrin.h:3766
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1819
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1044
_mm_storeu_si128(p,(__m128i) tmp)
#define _mm_bsrli_si128(a, imm)
Definition: emmintrin.h:3040
return vec_sel(__B, __A, m)
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2092
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4913
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:4746
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1837
__inline __m128i char char char char char char char char char char char char char __q02
Definition: emmintrin.h:682
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2161
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2840
unsigned long element_selector_FE
Definition: emmintrin.h:2044
__m128i_u * p
Definition: emmintrin.h:2134
__inline __m128i long long __q0
Definition: emmintrin.h:657
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:759
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1096
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2643
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:312
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3133
__inline __m128i char char char char char char __q09
Definition: emmintrin.h:682
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:560
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1224
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4797
__v8hu rshift
Definition: emmintrin.h:1760
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1295
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3548
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2570
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1358
__inline __m128 const float __Y
Definition: xmmintrin.h:131
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition: altivec.h:6040
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four upper 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4492
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition: altivec.h:9181
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2511
lshift
Definition: emmintrin.h:1710
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1778
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2725
__inline __m128i char char char char __q11
Definition: emmintrin.h:682
__inline __m128 __m64 const * __P
Definition: xmmintrin.h:1269
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2140
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4575
__inline __m128i short short short short short short short __q7
Definition: emmintrin.h:742
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3190
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3916
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:640
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location...
Definition: emmintrin.h:4104
__vector unsigned char __v16qu
Definition: emmintrin.h:52
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3631
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4930
c
Definition: emmintrin.h:306
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition: altivec.h:810
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
Definition: emmintrin.h:4358
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1276
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:268
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:56
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3842
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:785
w0
Definition: emmintrin.h:1415
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4737
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:12164
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4720
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1545
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
Definition: altivec.h:10014
const int litmsk
Definition: emmintrin.h:1150
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1641
const __v16qu zeros
Definition: emmintrin.h:1566
__inline __m128d double __X
Definition: emmintrin.h:89
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2935
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1597
__inline __m128i short short __q5
Definition: emmintrin.h:674
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4862
b
Definition: emmintrin.h:321
__tmp
Definition: xmmintrin.h:166
__v16qu vmax
Definition: emmintrin.h:2159
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3336
__v16qu mask
Definition: emmintrin.h:2133
__vector int __v4si
Definition: emmintrin.h:47
__inline __m128i char char char char char char char char __q07
Definition: emmintrin.h:682
__inline __m128i char char char __q12
Definition: emmintrin.h:682
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8029
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:684
__inline __m128i char char char char char char char __q08
Definition: emmintrin.h:682
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3609
__vector unsigned char xform1
Definition: emmintrin.h:1405
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors...
Definition: emmintrin.h:2244
__asm__("vmuleuw %0,%1,%2" :"=v"(result) :"v"(__A), "v"(__B) :)
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location...
Definition: emmintrin.h:1972
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1172
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers...
Definition: emmintrin.h:4275
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:11173
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2995
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2606
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1528
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1724
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3273
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1896
#define _mm_load_pd1(dp)
Definition: emmintrin.h:1606
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
Definition: emmintrin.h:4846
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3461
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:661
return() __m64((__vector long long) c)[0]
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition: altivec.h:6176
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3532
__v2du d
Definition: emmintrin.h:413
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:2054
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition: altivec.h:1163
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1911
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:117
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:15
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4896
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:181
__vector unsigned long long __v2du
Definition: emmintrin.h:46
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1400
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
Definition: altivec.h:8975
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1315
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1625
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2201
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1855
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:709
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:13651
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:467
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:10904
__inline __m128i char char char char char char char char char __q06
Definition: emmintrin.h:682
__inline void __m128d __A
Definition: emmintrin.h:169
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:9574
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
Definition: emmintrin.h:3445
double db
Definition: emmintrin.h:1103
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1875
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2916
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1146
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2412
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1579
unsigned long element_selector_54
Definition: emmintrin.h:2077
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition: altivec.h:4711
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2975
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:941
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:75
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:813
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition: altivec.h:4223
unsigned long element_selector_32
Definition: emmintrin.h:2076
__inline __m128d __m128d __B
Definition: emmintrin.h:118
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
Definition: altivec.h:8250
#define vec_cts
Definition: altivec.h:2922
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:967
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2117
__inline __m128i int int __q1
Definition: emmintrin.h:668
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors...
Definition: emmintrin.h:2222
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3209
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4755
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:389
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1950
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:916
#define vec_ctf(__a, __b)
Definition: altivec.h:2893
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter...
Definition: emmintrin.h:4382
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3229
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:598
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:201
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4703
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:226
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3659
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3252
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2139
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4776
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition: altivec.h:1968
r
Definition: emmintrin.h:563
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand...
Definition: emmintrin.h:3569
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2432
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
Definition: altivec.h:11609
__inline __m128d __m128d const int __mask
Definition: emmintrin.h:1148
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:2031
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1562
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:139
__inline __m128i char char char char char char char char char char char char char char char __q00
Definition: emmintrin.h:686
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:3015
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:98
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:350
__inline __m128 const float const float const float __W
Definition: xmmintrin.h:132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4879
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4462
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1122
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2492
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value...
Definition: emmintrin.h:1491
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:4123
__vector __bool short shmask
Definition: emmintrin.h:1703
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2743
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value...
Definition: emmintrin.h:3496
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3747
__inline __m128i char char char char char char char char char char __q05
Definition: emmintrin.h:682
w1
Definition: emmintrin.h:1416
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3114
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4682
__m64_union t
Definition: emmintrin.h:2059
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:891
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4947
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:288
__vector double __v2df
Definition: emmintrin.h:44
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2372
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
Definition: altivec.h:9444
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2392
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:532
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:407
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:1908
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3862
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors...
Definition: emmintrin.h:2286
__inline void __m128i char * __C
Definition: emmintrin.h:2131
__vector unsigned int __v4su
Definition: emmintrin.h:48
__v16qu tmp
Definition: emmintrin.h:2133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4631
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2072
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1423
__vector short __v8hi
Definition: emmintrin.h:49
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3977
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2530
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one&#39;s complement of the valu...
Definition: emmintrin.h:371
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4596
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:734
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2352
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value...
Definition: emmintrin.h:3804
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1508
__inline __m128i short short short short __q3
Definition: emmintrin.h:674
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4659
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b, vector signed char *__c)
Definition: altivec.h:10219
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition: altivec.h:1514
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2685
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:992
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2452
__v16qu vmin
Definition: emmintrin.h:2159
__inline __m128i char char char char char __q10
Definition: emmintrin.h:682
__inline __m128i const int __N
Definition: emmintrin.h:1564
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4331
__inline __m128i char char __q13
Definition: emmintrin.h:682
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
Definition: emmintrin.h:2705
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3995
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3294
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes. ...
Definition: emmintrin.h:3037
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
Definition: altivec.h:5638
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3587
__inline __m128i char char char char char char char char char char char char __q03
Definition: emmintrin.h:682
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors...
Definition: emmintrin.h:2552
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1070
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2472
__inline __m128i short short short __q4
Definition: emmintrin.h:674
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1018
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector par...
Definition: emmintrin.h:4432
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4525
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3152
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating point vector of [2 x double] to a 128-bit aligned memory location...
Definition: emmintrin.h:4146
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value...
Definition: emmintrin.h:3823
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1932
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:841
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:509
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:866
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded avarages of corresponding elements of two 128-bit unsigned [8 x i16] vectors...
Definition: emmintrin.h:2326
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:446
const __v8hu shmax
Definition: emmintrin.h:1704
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3076
__v2du pmask
Definition: emmintrin.h:2053
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:9484
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits...
Definition: emmintrin.h:2859
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double]...
Definition: emmintrin.h:1751
__vector long long __v2di
Definition: emmintrin.h:45
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:577
__v4si vsum
Definition: emmintrin.h:2160
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3428
__inline __m128i char char char char char char char char char char char char char char __q01
Definition: emmintrin.h:682
__v16qu vabsdiff
Definition: emmintrin.h:2159
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2588
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:332
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float]...
Definition: emmintrin.h:1449
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2798
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double]...
Definition: emmintrin.h:4816
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2664
static __inline__ vector signed char __ATTRS_o_ai vec_abs(vector signed char __a)
Definition: altivec.h:115
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2780
__inline __m128i int __q2
Definition: emmintrin.h:668
__inline __m128i short __q6
Definition: emmintrin.h:674
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
Definition: altivec.h:11748
__vector unsigned short __v8hu
Definition: emmintrin.h:50
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4399
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:158
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1623
a
Definition: emmintrin.h:320
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1992
result[0]
Definition: emmintrin.h:120
__inline __m128i char char char char char char char char char char char char char char char __q15
Definition: emmintrin.h:751
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition: altivec.h:4476
__inline __m128i const int _imm5
Definition: emmintrin.h:1613
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3412
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
Definition: altivec.h:9428
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2820
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition: altivec.h:5127
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded avarages of corresponding elements of two 128-bit unsigned [16 x i8] vectors...
Definition: emmintrin.h:2306
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors...
Definition: emmintrin.h:2265
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1799
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:2013
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:619
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one&#39;s complement of the values conta...
Definition: emmintrin.h:2763
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality...
Definition: emmintrin.h:3171
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4552
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:556
unsigned long element_selector_76
Definition: emmintrin.h:2078
res[0]
Definition: emmintrin.h:1141