clang 19.0.0git
emmintrin.h
Go to the documentation of this file.
1/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
12
13#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
   the PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level. There are differences in the data
   format and placement of float scalars in the vector register, which
   require extra steps to match SSE2 scalar float semantics on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended
   to use the portable <fenv.h> instead of accessing MXCSR directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications.
*/
32#error \
33 "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
34#endif
35
36#ifndef EMMINTRIN_H_
37#define EMMINTRIN_H_
38
39#if defined(__powerpc64__) && \
40 (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
41
42#include <altivec.h>
43
44/* We need definitions from the SSE header files. */
45#include <xmmintrin.h>
46
47/* SSE2 */
48typedef __vector double __v2df;
49typedef __vector float __v4f;
50typedef __vector long long __v2di;
51typedef __vector unsigned long long __v2du;
52typedef __vector int __v4si;
53typedef __vector unsigned int __v4su;
54typedef __vector short __v8hi;
55typedef __vector unsigned short __v8hu;
56typedef __vector signed char __v16qi;
57typedef __vector unsigned char __v16qu;
58
59/* The Intel API is flexible enough that we must allow aliasing with other
60 vector types, and their scalar components. */
61typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
62typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
63
64/* Unaligned version of the same types. */
65typedef long long __m128i_u
66 __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
67typedef double __m128d_u
68 __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
69
/* Build the two-element permute selector: x supplies bit 1 and y supplies
   bit 0 of the resulting mask. */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
72
73/* Create a vector with element 0 as F and the rest zero. */
74extern __inline __m128d
75 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76 _mm_set_sd(double __F) {
77 return __extension__(__m128d){__F, 0.0};
78}
79
80/* Create a vector with both elements equal to F. */
81extern __inline __m128d
82 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_set1_pd(double __F) {
84 return __extension__(__m128d){__F, __F};
85}
86
87extern __inline __m128d
88 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89 _mm_set_pd1(double __F) {
90 return _mm_set1_pd(__F);
91}
92
93/* Create a vector with the lower value X and upper value W. */
94extern __inline __m128d
95 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96 _mm_set_pd(double __W, double __X) {
97 return __extension__(__m128d){__X, __W};
98}
99
100/* Create a vector with the lower value W and upper value X. */
101extern __inline __m128d
102 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103 _mm_setr_pd(double __W, double __X) {
104 return __extension__(__m128d){__W, __X};
105}
106
107/* Create an undefined vector. */
108extern __inline __m128d
109 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_undefined_pd(void) {
111 __m128d __Y = __Y;
112 return __Y;
113}
114
115/* Create a vector of zeros. */
116extern __inline __m128d
117 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
118 _mm_setzero_pd(void) {
119 return (__m128d)vec_splats(0);
120}
121
122/* Sets the low DPFP value of A from the low value of B. */
123extern __inline __m128d
124 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 _mm_move_sd(__m128d __A, __m128d __B) {
126 __v2df __result = (__v2df)__A;
127 __result[0] = ((__v2df)__B)[0];
128 return (__m128d)__result;
129}
130
131/* Load two DPFP values from P. The address must be 16-byte aligned. */
132extern __inline __m128d
133 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm_load_pd(double const *__P) {
135 return ((__m128d)vec_ld(0, (__v16qu *)__P));
136}
137
138/* Load two DPFP values from P. The address need not be 16-byte aligned. */
139extern __inline __m128d
140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_loadu_pd(double const *__P) {
142 return (vec_vsx_ld(0, __P));
143}
144
145/* Create a vector with all two elements equal to *P. */
146extern __inline __m128d
147 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_load1_pd(double const *__P) {
149 return (vec_splats(*__P));
150}
151
152/* Create a vector with element 0 as *P and the rest zero. */
153extern __inline __m128d
154 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_load_sd(double const *__P) {
156 return _mm_set_sd(*__P);
157}
158
159extern __inline __m128d
160 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_load_pd1(double const *__P) {
162 return _mm_load1_pd(__P);
163}
164
165/* Load two DPFP values in reverse order. The address must be aligned. */
166extern __inline __m128d
167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168 _mm_loadr_pd(double const *__P) {
169 __v2df __tmp = _mm_load_pd(__P);
170 return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
171}
172
173/* Store two DPFP values. The address must be 16-byte aligned. */
174extern __inline void
175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_store_pd(double *__P, __m128d __A) {
177 vec_st((__v16qu)__A, 0, (__v16qu *)__P);
178}
179
180/* Store two DPFP values. The address need not be 16-byte aligned. */
181extern __inline void
182 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183 _mm_storeu_pd(double *__P, __m128d __A) {
184 *(__m128d_u *)__P = __A;
185}
186
187/* Stores the lower DPFP value. */
188extern __inline void
189 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190 _mm_store_sd(double *__P, __m128d __A) {
191 *__P = ((__v2df)__A)[0];
192}
193
194extern __inline double
195 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_cvtsd_f64(__m128d __A) {
197 return ((__v2df)__A)[0];
198}
199
200extern __inline void
201 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm_storel_pd(double *__P, __m128d __A) {
203 _mm_store_sd(__P, __A);
204}
205
206/* Stores the upper DPFP value. */
207extern __inline void
208 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209 _mm_storeh_pd(double *__P, __m128d __A) {
210 *__P = ((__v2df)__A)[1];
211}
212/* Store the lower DPFP value across two words.
213 The address must be 16-byte aligned. */
214extern __inline void
215 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm_store1_pd(double *__P, __m128d __A) {
217 _mm_store_pd(__P, vec_splat(__A, 0));
218}
219
220extern __inline void
221 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222 _mm_store_pd1(double *__P, __m128d __A) {
223 _mm_store1_pd(__P, __A);
224}
225
226/* Store two DPFP values in reverse order. The address must be aligned. */
227extern __inline void
228 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 _mm_storer_pd(double *__P, __m128d __A) {
230 _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
231}
232
233/* Intel intrinsic. */
234extern __inline long long
235 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
236 _mm_cvtsi128_si64(__m128i __A) {
237 return ((__v2di)__A)[0];
238}
239
240/* Microsoft intrinsic. */
241extern __inline long long
242 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243 _mm_cvtsi128_si64x(__m128i __A) {
244 return ((__v2di)__A)[0];
245}
246
247extern __inline __m128d
248 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249 _mm_add_pd(__m128d __A, __m128d __B) {
250 return (__m128d)((__v2df)__A + (__v2df)__B);
251}
252
253/* Add the lower double-precision (64-bit) floating-point element in
254 a and b, store the result in the lower element of dst, and copy
255 the upper element from a to the upper element of dst. */
256extern __inline __m128d
257 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258 _mm_add_sd(__m128d __A, __m128d __B) {
259 __A[0] = __A[0] + __B[0];
260 return (__A);
261}
262
263extern __inline __m128d
264 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_sub_pd(__m128d __A, __m128d __B) {
266 return (__m128d)((__v2df)__A - (__v2df)__B);
267}
268
269extern __inline __m128d
270 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
271 _mm_sub_sd(__m128d __A, __m128d __B) {
272 __A[0] = __A[0] - __B[0];
273 return (__A);
274}
275
276extern __inline __m128d
277 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278 _mm_mul_pd(__m128d __A, __m128d __B) {
279 return (__m128d)((__v2df)__A * (__v2df)__B);
280}
281
282extern __inline __m128d
283 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284 _mm_mul_sd(__m128d __A, __m128d __B) {
285 __A[0] = __A[0] * __B[0];
286 return (__A);
287}
288
289extern __inline __m128d
290 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291 _mm_div_pd(__m128d __A, __m128d __B) {
292 return (__m128d)((__v2df)__A / (__v2df)__B);
293}
294
295extern __inline __m128d
296 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297 _mm_div_sd(__m128d __A, __m128d __B) {
298 __A[0] = __A[0] / __B[0];
299 return (__A);
300}
301
302extern __inline __m128d
303 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304 _mm_sqrt_pd(__m128d __A) {
305 return (vec_sqrt(__A));
306}
307
308/* Return pair {sqrt (B[0]), A[1]}. */
309extern __inline __m128d
310 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_sqrt_sd(__m128d __A, __m128d __B) {
312 __v2df __c;
313 __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
314 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
315}
316
317extern __inline __m128d
318 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319 _mm_min_pd(__m128d __A, __m128d __B) {
320 return (vec_min(__A, __B));
321}
322
323extern __inline __m128d
324 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
325 _mm_min_sd(__m128d __A, __m128d __B) {
326 __v2df __a, __b, __c;
327 __a = vec_splats(__A[0]);
328 __b = vec_splats(__B[0]);
329 __c = vec_min(__a, __b);
330 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
331}
332
333extern __inline __m128d
334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_max_pd(__m128d __A, __m128d __B) {
336 return (vec_max(__A, __B));
337}
338
339extern __inline __m128d
340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm_max_sd(__m128d __A, __m128d __B) {
342 __v2df __a, __b, __c;
343 __a = vec_splats(__A[0]);
344 __b = vec_splats(__B[0]);
345 __c = vec_max(__a, __b);
346 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
347}
348
349extern __inline __m128d
350 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_cmpeq_pd(__m128d __A, __m128d __B) {
352 return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
353}
354
355extern __inline __m128d
356 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_cmplt_pd(__m128d __A, __m128d __B) {
358 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
359}
360
361extern __inline __m128d
362 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363 _mm_cmple_pd(__m128d __A, __m128d __B) {
364 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
365}
366
367extern __inline __m128d
368 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_cmpgt_pd(__m128d __A, __m128d __B) {
370 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
371}
372
373extern __inline __m128d
374 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375 _mm_cmpge_pd(__m128d __A, __m128d __B) {
376 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
377}
378
379extern __inline __m128d
380 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381 _mm_cmpneq_pd(__m128d __A, __m128d __B) {
382 __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
383 return ((__m128d)vec_nor(__temp, __temp));
384}
385
386extern __inline __m128d
387 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
389 return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
390}
391
392extern __inline __m128d
393 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394 _mm_cmpnle_pd(__m128d __A, __m128d __B) {
395 return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
396}
397
398extern __inline __m128d
399 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_cmpngt_pd(__m128d __A, __m128d __B) {
401 return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
402}
403
404extern __inline __m128d
405 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406 _mm_cmpnge_pd(__m128d __A, __m128d __B) {
407 return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
408}
409
410extern __inline __m128d
411 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412 _mm_cmpord_pd(__m128d __A, __m128d __B) {
413 __v2du __c, __d;
414 /* Compare against self will return false (0's) if NAN. */
415 __c = (__v2du)vec_cmpeq(__A, __A);
416 __d = (__v2du)vec_cmpeq(__B, __B);
417 /* A != NAN and B != NAN. */
418 return ((__m128d)vec_and(__c, __d));
419}
420
421extern __inline __m128d
422 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423 _mm_cmpunord_pd(__m128d __A, __m128d __B) {
424#if _ARCH_PWR8
425 __v2du __c, __d;
426 /* Compare against self will return false (0's) if NAN. */
427 __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
428 __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
429 /* A == NAN OR B == NAN converts too:
430 NOT(A != NAN) OR NOT(B != NAN). */
431 __c = vec_nor(__c, __c);
432 return ((__m128d)vec_orc(__c, __d));
433#else
434 __v2du __c, __d;
435 /* Compare against self will return false (0's) if NAN. */
436 __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
437 __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
438 /* Convert the true ('1's) is NAN. */
439 __c = vec_nor(__c, __c);
440 __d = vec_nor(__d, __d);
441 return ((__m128d)vec_or(__c, __d));
442#endif
443}
444
445extern __inline __m128d
446 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447 _mm_cmpeq_sd(__m128d __A, __m128d __B) {
448 __v2df __a, __b, __c;
449 /* PowerISA VSX does not allow partial (for just lower double)
450 results. So to insure we don't generate spurious exceptions
451 (from the upper double values) we splat the lower double
452 before we do the operation. */
453 __a = vec_splats(__A[0]);
454 __b = vec_splats(__B[0]);
455 __c = (__v2df)vec_cmpeq(__a, __b);
456 /* Then we merge the lower double result with the original upper
457 double from __A. */
458 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
459}
460
461extern __inline __m128d
462 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463 _mm_cmplt_sd(__m128d __A, __m128d __B) {
464 __v2df __a, __b, __c;
465 __a = vec_splats(__A[0]);
466 __b = vec_splats(__B[0]);
467 __c = (__v2df)vec_cmplt(__a, __b);
468 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
469}
470
471extern __inline __m128d
472 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_cmple_sd(__m128d __A, __m128d __B) {
474 __v2df __a, __b, __c;
475 __a = vec_splats(__A[0]);
476 __b = vec_splats(__B[0]);
477 __c = (__v2df)vec_cmple(__a, __b);
478 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
479}
480
481extern __inline __m128d
482 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 _mm_cmpgt_sd(__m128d __A, __m128d __B) {
484 __v2df __a, __b, __c;
485 __a = vec_splats(__A[0]);
486 __b = vec_splats(__B[0]);
487 __c = (__v2df)vec_cmpgt(__a, __b);
488 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
489}
490
491extern __inline __m128d
492 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
493 _mm_cmpge_sd(__m128d __A, __m128d __B) {
494 __v2df __a, __b, __c;
495 __a = vec_splats(__A[0]);
496 __b = vec_splats(__B[0]);
497 __c = (__v2df)vec_cmpge(__a, __b);
498 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
499}
500
501extern __inline __m128d
502 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503 _mm_cmpneq_sd(__m128d __A, __m128d __B) {
504 __v2df __a, __b, __c;
505 __a = vec_splats(__A[0]);
506 __b = vec_splats(__B[0]);
507 __c = (__v2df)vec_cmpeq(__a, __b);
508 __c = vec_nor(__c, __c);
509 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
510}
511
512extern __inline __m128d
513 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514 _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
515 __v2df __a, __b, __c;
516 __a = vec_splats(__A[0]);
517 __b = vec_splats(__B[0]);
518 /* Not less than is just greater than or equal. */
519 __c = (__v2df)vec_cmpge(__a, __b);
520 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
521}
522
523extern __inline __m128d
524 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525 _mm_cmpnle_sd(__m128d __A, __m128d __B) {
526 __v2df __a, __b, __c;
527 __a = vec_splats(__A[0]);
528 __b = vec_splats(__B[0]);
529 /* Not less than or equal is just greater than. */
530 __c = (__v2df)vec_cmpge(__a, __b);
531 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
532}
533
534extern __inline __m128d
535 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
536 _mm_cmpngt_sd(__m128d __A, __m128d __B) {
537 __v2df __a, __b, __c;
538 __a = vec_splats(__A[0]);
539 __b = vec_splats(__B[0]);
540 /* Not greater than is just less than or equal. */
541 __c = (__v2df)vec_cmple(__a, __b);
542 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
543}
544
545extern __inline __m128d
546 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
547 _mm_cmpnge_sd(__m128d __A, __m128d __B) {
548 __v2df __a, __b, __c;
549 __a = vec_splats(__A[0]);
550 __b = vec_splats(__B[0]);
551 /* Not greater than or equal is just less than. */
552 __c = (__v2df)vec_cmplt(__a, __b);
553 return (__m128d)_mm_setr_pd(__c[0], __A[1]);
554}
555
556extern __inline __m128d
557 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 _mm_cmpord_sd(__m128d __A, __m128d __B) {
559 __v2df __r;
560 __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
561 return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
562}
563
564extern __inline __m128d
565 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566 _mm_cmpunord_sd(__m128d __A, __m128d __B) {
567 __v2df __r;
568 __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
569 return (__m128d)_mm_setr_pd(__r[0], __A[1]);
570}
571
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
   be OK. */
579extern __inline int
580 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581 _mm_comieq_sd(__m128d __A, __m128d __B) {
582 return (__A[0] == __B[0]);
583}
584
585extern __inline int
586 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587 _mm_comilt_sd(__m128d __A, __m128d __B) {
588 return (__A[0] < __B[0]);
589}
590
591extern __inline int
592 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593 _mm_comile_sd(__m128d __A, __m128d __B) {
594 return (__A[0] <= __B[0]);
595}
596
597extern __inline int
598 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 _mm_comigt_sd(__m128d __A, __m128d __B) {
600 return (__A[0] > __B[0]);
601}
602
603extern __inline int
604 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605 _mm_comige_sd(__m128d __A, __m128d __B) {
606 return (__A[0] >= __B[0]);
607}
608
609extern __inline int
610 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611 _mm_comineq_sd(__m128d __A, __m128d __B) {
612 return (__A[0] != __B[0]);
613}
614
615extern __inline int
616 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617 _mm_ucomieq_sd(__m128d __A, __m128d __B) {
618 return (__A[0] == __B[0]);
619}
620
621extern __inline int
622 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 _mm_ucomilt_sd(__m128d __A, __m128d __B) {
624 return (__A[0] < __B[0]);
625}
626
627extern __inline int
628 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 _mm_ucomile_sd(__m128d __A, __m128d __B) {
630 return (__A[0] <= __B[0]);
631}
632
633extern __inline int
634 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635 _mm_ucomigt_sd(__m128d __A, __m128d __B) {
636 return (__A[0] > __B[0]);
637}
638
639extern __inline int
640 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 _mm_ucomige_sd(__m128d __A, __m128d __B) {
642 return (__A[0] >= __B[0]);
643}
644
645extern __inline int
646 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647 _mm_ucomineq_sd(__m128d __A, __m128d __B) {
648 return (__A[0] != __B[0]);
649}
650
651/* Create a vector of Qi, where i is the element number. */
652extern __inline __m128i
653 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654 _mm_set_epi64x(long long __q1, long long __q0) {
655 return __extension__(__m128i)(__v2di){__q0, __q1};
656}
657
658extern __inline __m128i
659 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 _mm_set_epi64(__m64 __q1, __m64 __q0) {
661 return _mm_set_epi64x((long long)__q1, (long long)__q0);
662}
663
664extern __inline __m128i
665 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666 _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
667 return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
668}
669
670extern __inline __m128i
671 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
672 _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
673 short __q2, short __q1, short __q0) {
674 return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
675 __q4, __q5, __q6, __q7};
676}
677
678extern __inline __m128i
679 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
681 char __q10, char __q09, char __q08, char __q07, char __q06,
682 char __q05, char __q04, char __q03, char __q02, char __q01,
683 char __q00) {
684 return __extension__(__m128i)(__v16qi){
685 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
686 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
687}
688
689/* Set all of the elements of the vector to A. */
690extern __inline __m128i
691 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692 _mm_set1_epi64x(long long __A) {
693 return _mm_set_epi64x(__A, __A);
694}
695
696extern __inline __m128i
697 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_set1_epi64(__m64 __A) {
699 return _mm_set_epi64(__A, __A);
700}
701
702extern __inline __m128i
703 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704 _mm_set1_epi32(int __A) {
705 return _mm_set_epi32(__A, __A, __A, __A);
706}
707
708extern __inline __m128i
709 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710 _mm_set1_epi16(short __A) {
711 return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
712}
713
714extern __inline __m128i
715 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716 _mm_set1_epi8(char __A) {
717 return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
718 __A, __A, __A, __A, __A);
719}
720
721/* Create a vector of Qi, where i is the element number.
722 The parameter order is reversed from the _mm_set_epi* functions. */
723extern __inline __m128i
724 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725 _mm_setr_epi64(__m64 __q0, __m64 __q1) {
726 return _mm_set_epi64(__q1, __q0);
727}
728
729extern __inline __m128i
730 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731 _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
732 return _mm_set_epi32(__q3, __q2, __q1, __q0);
733}
734
735extern __inline __m128i
736 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
738 short __q5, short __q6, short __q7) {
739 return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
740}
741
742extern __inline __m128i
743 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
744 _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
745 char __q05, char __q06, char __q07, char __q08, char __q09,
746 char __q10, char __q11, char __q12, char __q13, char __q14,
747 char __q15) {
748 return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
749 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
750}
751
752/* Create a vector with element 0 as *P and the rest zero. */
753extern __inline __m128i
754 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755 _mm_load_si128(__m128i const *__P) {
756 return *__P;
757}
758
759extern __inline __m128i
760 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761 _mm_loadu_si128(__m128i_u const *__P) {
762 return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
763}
764
765extern __inline __m128i
766 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767 _mm_loadl_epi64(__m128i_u const *__P) {
768 return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
769}
770
771extern __inline void
772 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773 _mm_store_si128(__m128i *__P, __m128i __B) {
774 vec_st((__v16qu)__B, 0, (__v16qu *)__P);
775}
776
777extern __inline void
778 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779 _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
780 *__P = __B;
781}
782
783extern __inline void
784 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785 _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
786 *(long long *)__P = ((__v2di)__B)[0];
787}
788
789extern __inline __m64
790 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791 _mm_movepi64_pi64(__m128i_u __B) {
792 return (__m64)((__v2di)__B)[0];
793}
794
795extern __inline __m128i
796 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_movpi64_epi64(__m64 __A) {
798 return _mm_set_epi64((__m64)0LL, __A);
799}
800
801extern __inline __m128i
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_move_epi64(__m128i __A) {
804 return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
805}
806
807/* Create an undefined vector. */
808extern __inline __m128i
809 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810 _mm_undefined_si128(void) {
811 __m128i __Y = __Y;
812 return __Y;
813}
814
815/* Create a vector of zeros. */
816extern __inline __m128i
817 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_setzero_si128(void) {
819 return __extension__(__m128i)(__v4si){0, 0, 0, 0};
820}
821
822#ifdef _ARCH_PWR8
823extern __inline __m128d
824 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 _mm_cvtepi32_pd(__m128i __A) {
826 __v2di __val;
827 /* For LE need to generate Vector Unpack Low Signed Word.
828 Which is generated from unpackh. */
829 __val = (__v2di)vec_unpackh((__v4si)__A);
830
831 return (__m128d)vec_ctf(__val, 0);
832}
833#endif
834
835extern __inline __m128
836 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837 _mm_cvtepi32_ps(__m128i __A) {
838 return ((__m128)vec_ctf((__v4si)__A, 0));
839}
840
841extern __inline __m128i
842 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843 _mm_cvtpd_epi32(__m128d __A) {
844 __v2df __rounded = vec_rint(__A);
845 __v4si __result, __temp;
846 const __v4si __vzero = {0, 0, 0, 0};
847
848 /* VSX Vector truncate Double-Precision to integer and Convert to
849 Signed Integer Word format with Saturate. */
850 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
851
852#ifdef _ARCH_PWR8
853#ifdef __LITTLE_ENDIAN__
854 __temp = vec_mergeo(__temp, __temp);
855#else
856 __temp = vec_mergee(__temp, __temp);
857#endif
858 __result = (__v4si)vec_vpkudum((__vector long long)__temp,
859 (__vector long long)__vzero);
860#else
861 {
862 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
863 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
864 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
865 }
866#endif
867 return (__m128i)__result;
868}
869
870extern __inline __m64
871 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872 _mm_cvtpd_pi32(__m128d __A) {
873 __m128i __result = _mm_cvtpd_epi32(__A);
874
875 return (__m64)__result[0];
876}
877
878extern __inline __m128
879 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880 _mm_cvtpd_ps(__m128d __A) {
881 __v4sf __result;
882 __v4si __temp;
883 const __v4si __vzero = {0, 0, 0, 0};
884
885 __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
886
887#ifdef _ARCH_PWR8
888#ifdef __LITTLE_ENDIAN__
889 __temp = vec_mergeo(__temp, __temp);
890#else
891 __temp = vec_mergee(__temp, __temp);
892#endif
893 __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
894 (__vector long long)__vzero);
895#else
896 {
897 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
898 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
899 __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
900 }
901#endif
902 return ((__m128)__result);
903}
904
905extern __inline __m128i
906 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
907 _mm_cvttpd_epi32(__m128d __A) {
908 __v4si __result;
909 __v4si __temp;
910 const __v4si __vzero = {0, 0, 0, 0};
911
912 /* VSX Vector truncate Double-Precision to integer and Convert to
913 Signed Integer Word format with Saturate. */
914 __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
915
916#ifdef _ARCH_PWR8
917#ifdef __LITTLE_ENDIAN__
918 __temp = vec_mergeo(__temp, __temp);
919#else
920 __temp = vec_mergee(__temp, __temp);
921#endif
922 __result = (__v4si)vec_vpkudum((__vector long long)__temp,
923 (__vector long long)__vzero);
924#else
925 {
926 const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
927 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
928 __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
929 }
930#endif
931
932 return ((__m128i)__result);
933}
934
935extern __inline __m64
936 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937 _mm_cvttpd_pi32(__m128d __A) {
938 __m128i __result = _mm_cvttpd_epi32(__A);
939
940 return (__m64)__result[0];
941}
942
943extern __inline int
944 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945 _mm_cvtsi128_si32(__m128i __A) {
946 return ((__v4si)__A)[0];
947}
948
#ifdef _ARCH_PWR8
/* Convert the integers held in __A to doubles: splat __A across a vector,
   unpack to 64-bit elements with vec_unpackl, then convert with vec_ctf. */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_pd(__m64 __A) {
  const __v4si __words = (__v4si)vec_splats(__A);
  const __v2di __wide = (__v2di)vec_unpackl(__words);
  __v4f __converted;

  __converted = vec_ctf((__vector signed long long)__wide, 0);
  return (__m128d)__converted;
}
#endif
963
964extern __inline __m128i
965 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966 _mm_cvtps_epi32(__m128 __A) {
967 __v4sf __rounded;
968 __v4si __result;
969
970 __rounded = vec_rint((__v4sf)__A);
971 __result = vec_cts(__rounded, 0);
972 return (__m128i)__result;
973}
974
975extern __inline __m128i
976 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _mm_cvttps_epi32(__m128 __A) {
978 __v4si __result;
979
980 __result = vec_cts((__v4sf)__A, 0);
981 return (__m128i)__result;
982}
983
984extern __inline __m128d
985 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986 _mm_cvtps_pd(__m128 __A) {
987 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
988#ifdef vec_doubleh
989 return (__m128d)vec_doubleh((__v4sf)__A);
990#else
991 /* Otherwise the compiler is not current and so need to generate the
992 equivalent code. */
993 __v4sf __a = (__v4sf)__A;
994 __v4sf __temp;
995 __v2df __result;
996#ifdef __LITTLE_ENDIAN__
997 /* The input float values are in elements {[0], [1]} but the convert
998 instruction needs them in elements {[1], [3]}, So we use two
999 shift left double vector word immediates to get the elements
1000 lined up. */
1001 __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1002 __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1003#else
1004 /* The input float values are in elements {[0], [1]} but the convert
1005 instruction needs them in elements {[0], [2]}, So we use two
1006 shift left double vector word immediates to get the elements
1007 lined up. */
1008 __temp = vec_vmrghw(__a, __a);
1009#endif
1010 __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1011 return (__m128d)__result;
1012#endif
1013}
1014
1015extern __inline int
1016 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017 _mm_cvtsd_si32(__m128d __A) {
1018 __v2df __rounded = vec_rint((__v2df)__A);
1019 int __result = ((__v2df)__rounded)[0];
1020
1021 return __result;
1022}
1023/* Intel intrinsic. */
1024extern __inline long long
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvtsd_si64(__m128d __A) {
1027 __v2df __rounded = vec_rint((__v2df)__A);
1028 long long __result = ((__v2df)__rounded)[0];
1029
1030 return __result;
1031}
1032
1033/* Microsoft intrinsic. */
1034extern __inline long long
1035 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm_cvtsd_si64x(__m128d __A) {
1037 return _mm_cvtsd_si64((__v2df)__A);
1038}
1039
1040extern __inline int
1041 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042 _mm_cvttsd_si32(__m128d __A) {
1043 int __result = ((__v2df)__A)[0];
1044
1045 return __result;
1046}
1047
1048/* Intel intrinsic. */
1049extern __inline long long
1050 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _mm_cvttsd_si64(__m128d __A) {
1052 long long __result = ((__v2df)__A)[0];
1053
1054 return __result;
1055}
1056
1057/* Microsoft intrinsic. */
1058extern __inline long long
1059 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_cvttsd_si64x(__m128d __A) {
1061 return _mm_cvttsd_si64(__A);
1062}
1063
1064extern __inline __m128
1065 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066 _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1067 __v4sf __result = (__v4sf)__A;
1068
1069#ifdef __LITTLE_ENDIAN__
1070 __v4sf __temp_s;
1071 /* Copy double element[0] to element [1] for conversion. */
1072 __v2df __temp_b = vec_splat((__v2df)__B, 0);
1073
1074 /* Pre-rotate __A left 3 (logically right 1) elements. */
1075 __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1076 /* Convert double to single float scalar in a vector. */
1077 __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1078 /* Shift the resulting scalar into vector element [0]. */
1079 __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1080#else
1081 __result[0] = ((__v2df)__B)[0];
1082#endif
1083 return (__m128)__result;
1084}
1085
1086extern __inline __m128d
1087 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088 _mm_cvtsi32_sd(__m128d __A, int __B) {
1089 __v2df __result = (__v2df)__A;
1090 double __db = __B;
1091 __result[0] = __db;
1092 return (__m128d)__result;
1093}
1094
1095/* Intel intrinsic. */
1096extern __inline __m128d
1097 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtsi64_sd(__m128d __A, long long __B) {
1099 __v2df __result = (__v2df)__A;
1100 double __db = __B;
1101 __result[0] = __db;
1102 return (__m128d)__result;
1103}
1104
1105/* Microsoft intrinsic. */
1106extern __inline __m128d
1107 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1109 return _mm_cvtsi64_sd(__A, __B);
1110}
1111
1112extern __inline __m128d
1113 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114 _mm_cvtss_sd(__m128d __A, __m128 __B) {
1115#ifdef __LITTLE_ENDIAN__
1116 /* Use splat to move element [0] into position for the convert. */
1117 __v4sf __temp = vec_splat((__v4sf)__B, 0);
1118 __v2df __res;
1119 /* Convert single float scalar to double in a vector. */
1120 __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1121 return (__m128d)vec_mergel(__res, (__v2df)__A);
1122#else
1123 __v2df __res = (__v2df)__A;
1124 __res[0] = ((__v4sf)__B)[0];
1125 return (__m128d)__res;
1126#endif
1127}
1128
1129extern __inline __m128d
1130 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1131 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1132 __vector double __result;
1133 const int __litmsk = __mask & 0x3;
1134
1135 if (__litmsk == 0)
1136 __result = vec_mergeh(__A, __B);
1137#if __GNUC__ < 6
1138 else if (__litmsk == 1)
1139 __result = vec_xxpermdi(__B, __A, 2);
1140 else if (__litmsk == 2)
1141 __result = vec_xxpermdi(__B, __A, 1);
1142#else
1143 else if (__litmsk == 1)
1144 __result = vec_xxpermdi(__A, __B, 2);
1145 else if (__litmsk == 2)
1146 __result = vec_xxpermdi(__A, __B, 1);
1147#endif
1148 else
1149 __result = vec_mergel(__A, __B);
1150
1151 return __result;
1152}
1153
1154extern __inline __m128d
1155 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156 _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1157 return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1158}
1159
1160extern __inline __m128d
1161 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162 _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1163 return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1164}
1165
1166extern __inline __m128d
1167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _mm_loadh_pd(__m128d __A, double const *__B) {
1169 __v2df __result = (__v2df)__A;
1170 __result[1] = *__B;
1171 return (__m128d)__result;
1172}
1173
1174extern __inline __m128d
1175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176 _mm_loadl_pd(__m128d __A, double const *__B) {
1177 __v2df __result = (__v2df)__A;
1178 __result[0] = *__B;
1179 return (__m128d)__result;
1180}
1181
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.
   With _ARCH_PWR10 this is a single vec_extractm; otherwise the bits are
   gathered with vec_vbpermq and read from one doubleword of its result. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pd(__m128d __A) {
#ifndef _ARCH_PWR10
  /* Bit selector for vec_vbpermq; only two of its bytes (0x00 and 0x40)
     differ from the 0x80 filler. */
  static const __vector unsigned int __bit_select = {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
  };
  const __vector unsigned long long __gathered =
      (__vector unsigned long long)vec_vbpermq(
          (__vector unsigned char)__A, (__vector unsigned char)__bit_select);

#ifdef __LITTLE_ENDIAN__
  return __gathered[1];
#else
  return __gathered[0];
#endif
#else
  return vec_extractm((__v2du)__A);
#endif /* _ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
1212
1213extern __inline __m128i
1214 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215 _mm_packs_epi16(__m128i __A, __m128i __B) {
1216 return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1217}
1218
1219extern __inline __m128i
1220 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _mm_packs_epi32(__m128i __A, __m128i __B) {
1222 return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1223}
1224
1225extern __inline __m128i
1226 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227 _mm_packus_epi16(__m128i __A, __m128i __B) {
1228 return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1229}
1230
1231extern __inline __m128i
1232 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233 _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1234 return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1235}
1236
1237extern __inline __m128i
1238 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239 _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1240 return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1241}
1242
1243extern __inline __m128i
1244 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245 _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1246 return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1247}
1248
1249extern __inline __m128i
1250 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1252 return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1253}
1254
1255extern __inline __m128i
1256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257 _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1258 return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1259}
1260
1261extern __inline __m128i
1262 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263 _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1264 return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1265}
1266
1267extern __inline __m128i
1268 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1270 return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1271}
1272
1273extern __inline __m128i
1274 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1276 return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1277}
1278
1279extern __inline __m128i
1280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_add_epi8(__m128i __A, __m128i __B) {
1282 return (__m128i)((__v16qu)__A + (__v16qu)__B);
1283}
1284
1285extern __inline __m128i
1286 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287 _mm_add_epi16(__m128i __A, __m128i __B) {
1288 return (__m128i)((__v8hu)__A + (__v8hu)__B);
1289}
1290
1291extern __inline __m128i
1292 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_add_epi32(__m128i __A, __m128i __B) {
1294 return (__m128i)((__v4su)__A + (__v4su)__B);
1295}
1296
1297extern __inline __m128i
1298 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299 _mm_add_epi64(__m128i __A, __m128i __B) {
1300 return (__m128i)((__v2du)__A + (__v2du)__B);
1301}
1302
1303extern __inline __m128i
1304 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305 _mm_adds_epi8(__m128i __A, __m128i __B) {
1306 return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1307}
1308
1309extern __inline __m128i
1310 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311 _mm_adds_epi16(__m128i __A, __m128i __B) {
1312 return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1313}
1314
1315extern __inline __m128i
1316 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_adds_epu8(__m128i __A, __m128i __B) {
1318 return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1319}
1320
1321extern __inline __m128i
1322 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323 _mm_adds_epu16(__m128i __A, __m128i __B) {
1324 return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1325}
1326
1327extern __inline __m128i
1328 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_sub_epi8(__m128i __A, __m128i __B) {
1330 return (__m128i)((__v16qu)__A - (__v16qu)__B);
1331}
1332
1333extern __inline __m128i
1334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335 _mm_sub_epi16(__m128i __A, __m128i __B) {
1336 return (__m128i)((__v8hu)__A - (__v8hu)__B);
1337}
1338
1339extern __inline __m128i
1340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_sub_epi32(__m128i __A, __m128i __B) {
1342 return (__m128i)((__v4su)__A - (__v4su)__B);
1343}
1344
1345extern __inline __m128i
1346 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347 _mm_sub_epi64(__m128i __A, __m128i __B) {
1348 return (__m128i)((__v2du)__A - (__v2du)__B);
1349}
1350
1351extern __inline __m128i
1352 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353 _mm_subs_epi8(__m128i __A, __m128i __B) {
1354 return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1355}
1356
1357extern __inline __m128i
1358 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359 _mm_subs_epi16(__m128i __A, __m128i __B) {
1360 return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1361}
1362
1363extern __inline __m128i
1364 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365 _mm_subs_epu8(__m128i __A, __m128i __B) {
1366 return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1367}
1368
1369extern __inline __m128i
1370 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371 _mm_subs_epu16(__m128i __A, __m128i __B) {
1372 return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1373}
1374
1375extern __inline __m128i
1376 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377 _mm_madd_epi16(__m128i __A, __m128i __B) {
1378 __vector signed int __zero = {0, 0, 0, 0};
1379
1380 return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1381}
1382
1383extern __inline __m128i
1384 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385 _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1386 __vector signed int __w0, __w1;
1387
1388 __vector unsigned char __xform1 = {
1389#ifdef __LITTLE_ENDIAN__
1390 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1391 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392#else
1393 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1394 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1395#endif
1396 };
1397
1398 __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1399 __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1400 return (__m128i)vec_perm(__w0, __w1, __xform1);
1401}
1402
1403extern __inline __m128i
1404 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 _mm_mullo_epi16(__m128i __A, __m128i __B) {
1406 return (__m128i)((__v8hi)__A * (__v8hi)__B);
1407}
1408
1409extern __inline __m64
1410 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm_mul_su32(__m64 __A, __m64 __B) {
1412 unsigned int __a = __A;
1413 unsigned int __b = __B;
1414
1415 return ((__m64)__a * (__m64)__b);
1416}
1417
#ifdef _ARCH_PWR8
/* Multiply unsigned 32-bit elements of __A and __B into 64-bit products.
   Compilers reporting __GNUC__ >= 8 use vec_mule; older ones issue the
   multiply instruction through inline assembly. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epu32(__m128i __A, __m128i __B) {
#if __GNUC__ >= 8
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
#else
  __v2du __products;

#ifdef __LITTLE_ENDIAN__
  /* VMX Vector Multiply Odd Unsigned Word. */
  __asm__("vmulouw %0,%1,%2" : "=v"(__products) : "v"(__A), "v"(__B) :);
#else
  /* VMX Vector Multiply Even Unsigned Word. */
  __asm__("vmuleuw %0,%1,%2" : "=v"(__products) : "v"(__A), "v"(__B) :);
#endif
  return (__m128i)__products;
#endif
}
#endif
1438
1439extern __inline __m128i
1440 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1441 _mm_slli_epi16(__m128i __A, int __B) {
1442 __v8hu __lshift;
1443 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1444
1445 if (__B >= 0 && __B < 16) {
1446 if (__builtin_constant_p(__B))
1447 __lshift = (__v8hu)vec_splat_s16(__B);
1448 else
1449 __lshift = vec_splats((unsigned short)__B);
1450
1451 __result = vec_sl((__v8hi)__A, __lshift);
1452 }
1453
1454 return (__m128i)__result;
1455}
1456
1457extern __inline __m128i
1458 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1459 _mm_slli_epi32(__m128i __A, int __B) {
1460 __v4su __lshift;
1461 __v4si __result = {0, 0, 0, 0};
1462
1463 if (__B >= 0 && __B < 32) {
1464 if (__builtin_constant_p(__B) && __B < 16)
1465 __lshift = (__v4su)vec_splat_s32(__B);
1466 else
1467 __lshift = vec_splats((unsigned int)__B);
1468
1469 __result = vec_sl((__v4si)__A, __lshift);
1470 }
1471
1472 return (__m128i)__result;
1473}
1474
#ifdef _ARCH_PWR8
/* Shift each __v2di element of __A left by __B bits with vec_sl.
   A count outside [0, 63] yields an all-zero vector. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_epi64(__m128i __A, int __B) {
  const __v2di __zeros = {0, 0};
  __v2du __count;

  if (__B < 0 || __B >= 64)
    return (__m128i)__zeros;

  /* The count is splatted as 32-bit words and reinterpreted as __v2du. */
  if (__builtin_constant_p(__B) && __B < 16)
    __count = (__v2du)vec_splat_s32(__B);
  else
    __count = (__v2du)vec_splats((unsigned int)__B);

  return (__m128i)vec_sl((__v2di)__A, __count);
}
#endif
1494
1495extern __inline __m128i
1496 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497 _mm_srai_epi16(__m128i __A, int __B) {
1498 __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1499 __v8hi __result;
1500
1501 if (__B < 16) {
1502 if (__builtin_constant_p(__B))
1503 __rshift = (__v8hu)vec_splat_s16(__B);
1504 else
1505 __rshift = vec_splats((unsigned short)__B);
1506 }
1507 __result = vec_sra((__v8hi)__A, __rshift);
1508
1509 return (__m128i)__result;
1510}
1511
1512extern __inline __m128i
1513 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514 _mm_srai_epi32(__m128i __A, int __B) {
1515 __v4su __rshift = {31, 31, 31, 31};
1516 __v4si __result;
1517
1518 if (__B < 32) {
1519 if (__builtin_constant_p(__B)) {
1520 if (__B < 16)
1521 __rshift = (__v4su)vec_splat_s32(__B);
1522 else
1523 __rshift = (__v4su)vec_splats((unsigned int)__B);
1524 } else
1525 __rshift = vec_splats((unsigned int)__B);
1526 }
1527 __result = vec_sra((__v4si)__A, __rshift);
1528
1529 return (__m128i)__result;
1530}
1531
1532extern __inline __m128i
1533 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534 _mm_bslli_si128(__m128i __A, const int __N) {
1535 __v16qu __result;
1536 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1537
1538 if (__N < 16)
1539 __result = vec_sld((__v16qu)__A, __zeros, __N);
1540 else
1541 __result = __zeros;
1542
1543 return (__m128i)__result;
1544}
1545
1546extern __inline __m128i
1547 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548 _mm_bsrli_si128(__m128i __A, const int __N) {
1549 __v16qu __result;
1550 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1551
1552 if (__N < 16)
1553#ifdef __LITTLE_ENDIAN__
1554 if (__builtin_constant_p(__N))
1555 /* Would like to use Vector Shift Left Double by Octet
1556 Immediate here to use the immediate form and avoid
1557 load of __N * 8 value into a separate VR. */
1558 __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1559 else
1560#endif
1561 {
1562 __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1563#ifdef __LITTLE_ENDIAN__
1564 __result = vec_sro((__v16qu)__A, __shift);
1565#else
1566 __result = vec_slo((__v16qu)__A, __shift);
1567#endif
1568 }
1569 else
1570 __result = __zeros;
1571
1572 return (__m128i)__result;
1573}
1574
1575extern __inline __m128i
1576 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577 _mm_srli_si128(__m128i __A, const int __N) {
1578 return _mm_bsrli_si128(__A, __N);
1579}
1580
1581extern __inline __m128i
1582 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1583 _mm_slli_si128(__m128i __A, const int _imm5) {
1584 __v16qu __result;
1585 const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1586
1587 if (_imm5 < 16)
1588#ifdef __LITTLE_ENDIAN__
1589 __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1590#else
1591 __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1592#endif
1593 else
1594 __result = __zeros;
1595
1596 return (__m128i)__result;
1597}
1598
1599extern __inline __m128i
1600 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601
1602 _mm_srli_epi16(__m128i __A, int __B) {
1603 __v8hu __rshift;
1604 __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1605
1606 if (__B < 16) {
1607 if (__builtin_constant_p(__B))
1608 __rshift = (__v8hu)vec_splat_s16(__B);
1609 else
1610 __rshift = vec_splats((unsigned short)__B);
1611
1612 __result = vec_sr((__v8hi)__A, __rshift);
1613 }
1614
1615 return (__m128i)__result;
1616}
1617
1618extern __inline __m128i
1619 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1620 _mm_srli_epi32(__m128i __A, int __B) {
1621 __v4su __rshift;
1622 __v4si __result = {0, 0, 0, 0};
1623
1624 if (__B < 32) {
1625 if (__builtin_constant_p(__B)) {
1626 if (__B < 16)
1627 __rshift = (__v4su)vec_splat_s32(__B);
1628 else
1629 __rshift = (__v4su)vec_splats((unsigned int)__B);
1630 } else
1631 __rshift = vec_splats((unsigned int)__B);
1632
1633 __result = vec_sr((__v4si)__A, __rshift);
1634 }
1635
1636 return (__m128i)__result;
1637}
1638
#ifdef _ARCH_PWR8
/* Logical right shift of each __v2di element of __A by __B bits (vec_sr).
   A count outside [0, 63], negative counts included, yields an all-zero
   vector, matching the range check used by _mm_slli_epi64. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi64(__m128i __A, int __B) {
  __v2du __rshift;
  __v2di __result = {0, 0};

  if (__B >= 0 && __B < 64) {
    if (__builtin_constant_p(__B)) {
      /* Only counts below 16 use the splat-immediate form. */
      if (__B < 16)
        __rshift = (__v2du)vec_splat_s32(__B);
      else
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
    } else
      __rshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sr((__v2di)__A, __rshift);
  }

  return (__m128i)__result;
}
#endif
1661
1662extern __inline __m128i
1663 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664 _mm_sll_epi16(__m128i __A, __m128i __B) {
1665 __v8hu __lshift;
1666 __vector __bool short __shmask;
1667 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1668 __v8hu __result;
1669
1670#ifdef __LITTLE_ENDIAN__
1671 __lshift = vec_splat((__v8hu)__B, 0);
1672#else
1673 __lshift = vec_splat((__v8hu)__B, 3);
1674#endif
1675 __shmask = vec_cmple(__lshift, __shmax);
1676 __result = vec_sl((__v8hu)__A, __lshift);
1677 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1678
1679 return (__m128i)__result;
1680}
1681
1682extern __inline __m128i
1683 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684 _mm_sll_epi32(__m128i __A, __m128i __B) {
1685 __v4su __lshift;
1686 __vector __bool int __shmask;
1687 const __v4su __shmax = {32, 32, 32, 32};
1688 __v4su __result;
1689#ifdef __LITTLE_ENDIAN__
1690 __lshift = vec_splat((__v4su)__B, 0);
1691#else
1692 __lshift = vec_splat((__v4su)__B, 1);
1693#endif
1694 __shmask = vec_cmplt(__lshift, __shmax);
1695 __result = vec_sl((__v4su)__A, __lshift);
1696 __result = vec_sel((__v4su)__shmask, __result, __shmask);
1697
1698 return (__m128i)__result;
1699}
1700
#ifdef _ARCH_PWR8
/* Shift each __v2du element of __A left by the count in 64-bit element 0
   of __B. Results are kept only where that count is < 64; otherwise they
   are cleared to zero. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi64(__m128i __A, __m128i __B) {
  const __v2du __limit = {64, 64};
  const __v2du __count = vec_splat((__v2du)__B, 0);
  /* All-ones where the count is in range, all-zeros elsewhere. */
  __vector __bool long long __in_range = vec_cmplt(__count, __limit);
  __v2du __shifted = vec_sl((__v2du)__A, __count);

  /* Keep the result where the mask is set and leave zero elsewhere. */
  __shifted = vec_sel((__v2du)__in_range, __shifted, __in_range);

  return (__m128i)__shifted;
}
#endif
1718
1719extern __inline __m128i
1720 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721 _mm_sra_epi16(__m128i __A, __m128i __B) {
1722 const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1723 __v8hu __rshift;
1724 __v8hi __result;
1725
1726#ifdef __LITTLE_ENDIAN__
1727 __rshift = vec_splat((__v8hu)__B, 0);
1728#else
1729 __rshift = vec_splat((__v8hu)__B, 3);
1730#endif
1731 __rshift = vec_min(__rshift, __rshmax);
1732 __result = vec_sra((__v8hi)__A, __rshift);
1733
1734 return (__m128i)__result;
1735}
1736
1737extern __inline __m128i
1738 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1739 _mm_sra_epi32(__m128i __A, __m128i __B) {
1740 const __v4su __rshmax = {31, 31, 31, 31};
1741 __v4su __rshift;
1742 __v4si __result;
1743
1744#ifdef __LITTLE_ENDIAN__
1745 __rshift = vec_splat((__v4su)__B, 0);
1746#else
1747 __rshift = vec_splat((__v4su)__B, 1);
1748#endif
1749 __rshift = vec_min(__rshift, __rshmax);
1750 __result = vec_sra((__v4si)__A, __rshift);
1751
1752 return (__m128i)__result;
1753}
1754
1755extern __inline __m128i
1756 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_srl_epi16(__m128i __A, __m128i __B) {
1758 __v8hu __rshift;
1759 __vector __bool short __shmask;
1760 const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1761 __v8hu __result;
1762
1763#ifdef __LITTLE_ENDIAN__
1764 __rshift = vec_splat((__v8hu)__B, 0);
1765#else
1766 __rshift = vec_splat((__v8hu)__B, 3);
1767#endif
1768 __shmask = vec_cmple(__rshift, __shmax);
1769 __result = vec_sr((__v8hu)__A, __rshift);
1770 __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1771
1772 return (__m128i)__result;
1773}
1774
1775extern __inline __m128i
1776 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777 _mm_srl_epi32(__m128i __A, __m128i __B) {
1778 __v4su __rshift;
1779 __vector __bool int __shmask;
1780 const __v4su __shmax = {32, 32, 32, 32};
1781 __v4su __result;
1782
1783#ifdef __LITTLE_ENDIAN__
1784 __rshift = vec_splat((__v4su)__B, 0);
1785#else
1786 __rshift = vec_splat((__v4su)__B, 1);
1787#endif
1788 __shmask = vec_cmplt(__rshift, __shmax);
1789 __result = vec_sr((__v4su)__A, __rshift);
1790 __result = vec_sel((__v4su)__shmask, __result, __shmask);
1791
1792 return (__m128i)__result;
1793}
1794
#ifdef _ARCH_PWR8
/* Shift each __v2du element of __A right (vec_sr) by the count in 64-bit
   element 0 of __B. Results are kept only where that count is < 64;
   otherwise they are cleared to zero. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  const __v2du __limit = {64, 64};
  const __v2du __count = vec_splat((__v2du)__B, 0);
  /* All-ones where the count is in range, all-zeros elsewhere. */
  __vector __bool long long __in_range = vec_cmplt(__count, __limit);
  __v2du __shifted = vec_sr((__v2du)__A, __count);

  /* Keep the result where the mask is set and leave zero elsewhere. */
  __shifted = vec_sel((__v2du)__in_range, __shifted, __in_range);

  return (__m128i)__shifted;
}
#endif
1812
1813extern __inline __m128d
1814 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1815 _mm_and_pd(__m128d __A, __m128d __B) {
1816 return (vec_and((__v2df)__A, (__v2df)__B));
1817}
1818
1819extern __inline __m128d
1820 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1821 _mm_andnot_pd(__m128d __A, __m128d __B) {
1822 return (vec_andc((__v2df)__B, (__v2df)__A));
1823}
1824
1825extern __inline __m128d
1826 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1827 _mm_or_pd(__m128d __A, __m128d __B) {
1828 return (vec_or((__v2df)__A, (__v2df)__B));
1829}
1830
1831extern __inline __m128d
1832 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1833 _mm_xor_pd(__m128d __A, __m128d __B) {
1834 return (vec_xor((__v2df)__A, (__v2df)__B));
1835}
1836
1837extern __inline __m128i
1838 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1839 _mm_and_si128(__m128i __A, __m128i __B) {
1840 return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1841}
1842
1843extern __inline __m128i
1844 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1845 _mm_andnot_si128(__m128i __A, __m128i __B) {
1846 return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1847}
1848
1849extern __inline __m128i
1850 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851 _mm_or_si128(__m128i __A, __m128i __B) {
1852 return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1853}
1854
1855extern __inline __m128i
1856 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857 _mm_xor_si128(__m128i __A, __m128i __B) {
1858 return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1859}
1860
1861extern __inline __m128i
1862 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863 _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1864 return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1865}
1866
1867extern __inline __m128i
1868 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869 _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1870 return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1871}
1872
1873extern __inline __m128i
1874 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875 _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1876 return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1877}
1878
1879extern __inline __m128i
1880 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881 _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1882 return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1883}
1884
1885extern __inline __m128i
1886 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887 _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1888 return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1889}
1890
1891extern __inline __m128i
1892 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893 _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1894 return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1895}
1896
1897extern __inline __m128i
1898 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899 _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1900 return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1901}
1902
1903extern __inline __m128i
1904 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905 _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1906 return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1907}
1908
1909extern __inline __m128i
1910 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911 _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1912 return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1913}
1914
1915extern __inline int
1916 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917 _mm_extract_epi16(__m128i const __A, int const __N) {
1918 return (unsigned short)((__v8hi)__A)[__N & 7];
1919}
1920
1921extern __inline __m128i
1922 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923 _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1924 __v8hi __result = (__v8hi)__A;
1925
1926 __result[(__N & 7)] = __D;
1927
1928 return (__m128i)__result;
1929}
1930
1931extern __inline __m128i
1932 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1933 _mm_max_epi16(__m128i __A, __m128i __B) {
1934 return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1935}
1936
1937extern __inline __m128i
1938 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1939 _mm_max_epu8(__m128i __A, __m128i __B) {
1940 return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1941}
1942
1943extern __inline __m128i
1944 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1945 _mm_min_epi16(__m128i __A, __m128i __B) {
1946 return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1947}
1948
1949extern __inline __m128i
1950 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1951 _mm_min_epu8(__m128i __A, __m128i __B) {
1952 return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1953}
1954
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Build an integer mask from the most significant bit of each 8-bit
   element of A.

   When _ARCH_PWR10 is defined the mask comes straight from vec_extractm.
   Otherwise vec_vbpermq gathers one bit per selector byte; every selector
   below is a multiple of 8 (0x78 down to 0x00), i.e. one bit position per
   source byte.  The gathered bits are then read from doubleword element 1
   on little-endian targets and element 0 on big-endian targets.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  static const __vector unsigned char __bit_picks = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
  __vector unsigned long long __gathered;

  __gathered = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__bit_picks));

#ifdef __LITTLE_ENDIAN__
  return __gathered[1];
#else
  return __gathered[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
1982
1983extern __inline __m128i
1984 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1985 _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1986 __v4su __w0, __w1;
1987 __v16qu __xform1 = {
1988#ifdef __LITTLE_ENDIAN__
1989 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1990 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1991#else
1992 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1993 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1994#endif
1995 };
1996
1997 __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1998 __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1999 return (__m128i)vec_perm(__w0, __w1, __xform1);
2000}
2001
2002extern __inline __m128i
2003 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2004 _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2005 unsigned long __element_selector_98 = __mask & 0x03;
2006 unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2007 unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2008 unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2009 static const unsigned short __permute_selectors[4] = {
2010#ifdef __LITTLE_ENDIAN__
2011 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2012#else
2013 0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2014#endif
2015 };
2016 __v2du __pmask =
2017#ifdef __LITTLE_ENDIAN__
2018 {0x1716151413121110UL, 0UL};
2019#else
2020 {0x1011121314151617UL, 0UL};
2021#endif
2022 __m64_union __t;
2023 __v2du __a, __r;
2024
2025 __t.as_short[0] = __permute_selectors[__element_selector_98];
2026 __t.as_short[1] = __permute_selectors[__element_selector_BA];
2027 __t.as_short[2] = __permute_selectors[__element_selector_DC];
2028 __t.as_short[3] = __permute_selectors[__element_selector_FE];
2029 __pmask[1] = __t.as_m64;
2030 __a = (__v2du)__A;
2031 __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2032 return (__m128i)__r;
2033}
2034
2035extern __inline __m128i
2036 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2037 _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2038 unsigned long __element_selector_10 = __mask & 0x03;
2039 unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2040 unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2041 unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2042 static const unsigned short __permute_selectors[4] = {
2043#ifdef __LITTLE_ENDIAN__
2044 0x0100, 0x0302, 0x0504, 0x0706
2045#else
2046 0x0001, 0x0203, 0x0405, 0x0607
2047#endif
2048 };
2049 __v2du __pmask =
2050#ifdef __LITTLE_ENDIAN__
2051 {0UL, 0x1f1e1d1c1b1a1918UL};
2052#else
2053 {0UL, 0x18191a1b1c1d1e1fUL};
2054#endif
2055 __m64_union __t;
2056 __v2du __a, __r;
2057 __t.as_short[0] = __permute_selectors[__element_selector_10];
2058 __t.as_short[1] = __permute_selectors[__element_selector_32];
2059 __t.as_short[2] = __permute_selectors[__element_selector_54];
2060 __t.as_short[3] = __permute_selectors[__element_selector_76];
2061 __pmask[0] = __t.as_m64;
2062 __a = (__v2du)__A;
2063 __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2064 return (__m128i)__r;
2065}
2066
2067extern __inline __m128i
2068 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2069 _mm_shuffle_epi32(__m128i __A, const int __mask) {
2070 unsigned long __element_selector_10 = __mask & 0x03;
2071 unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2072 unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2073 unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2074 static const unsigned int __permute_selectors[4] = {
2075#ifdef __LITTLE_ENDIAN__
2076 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2077#else
2078 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2079#endif
2080 };
2081 __v4su __t;
2082
2083 __t[0] = __permute_selectors[__element_selector_10];
2084 __t[1] = __permute_selectors[__element_selector_32];
2085 __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2086 __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2087 return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2088 (__vector unsigned char)__t);
2089}
2090
2091extern __inline void
2092 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2093 _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2094 __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2095 __v16qu __mask, __tmp;
2096 __m128i_u *__p = (__m128i_u *)__C;
2097
2098 __tmp = (__v16qu)_mm_loadu_si128(__p);
2099 __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2100 __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2101 _mm_storeu_si128(__p, (__m128i)__tmp);
2102}
2103
2104extern __inline __m128i
2105 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106 _mm_avg_epu8(__m128i __A, __m128i __B) {
2107 return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2108}
2109
2110extern __inline __m128i
2111 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2112 _mm_avg_epu16(__m128i __A, __m128i __B) {
2113 return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2114}
2115
2116extern __inline __m128i
2117 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2118 _mm_sad_epu8(__m128i __A, __m128i __B) {
2119 __v16qu __a, __b;
2120 __v16qu __vabsdiff;
2121 __v4si __vsum;
2122 const __v4su __zero = {0, 0, 0, 0};
2123 __v4si __result;
2124
2125 __a = (__v16qu)__A;
2126 __b = (__v16qu)__B;
2127#ifndef _ARCH_PWR9
2128 __v16qu __vmin = vec_min(__a, __b);
2129 __v16qu __vmax = vec_max(__a, __b);
2130 __vabsdiff = vec_sub(__vmax, __vmin);
2131#else
2132 __vabsdiff = vec_absd(__a, __b);
2133#endif
2134 /* Sum four groups of bytes into integers. */
2135 __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2136#ifdef __LITTLE_ENDIAN__
2137 /* Sum across four integers with two integer results. */
2138 __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2139 /* Note: vec_sum2s could be used here, but on little-endian, vector
2140 shifts are added that are not needed for this use-case.
2141 A vector shift to correctly position the 32-bit integer results
2142 (currently at [0] and [2]) to [1] and [3] would then need to be
2143 swapped back again since the desired results are two 64-bit
2144 integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
2145#else
2146 /* Sum across four integers with two integer results. */
2147 __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2148 /* Rotate the sums into the correct position. */
2149 __result = vec_sld(__result, __result, 6);
2150#endif
2151 return (__m128i)__result;
2152}
2153
2154extern __inline void
2155 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156 _mm_stream_si32(int *__A, int __B) {
2157 /* Use the data cache block touch for store transient. */
2158 __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2159 *__A = __B;
2160}
2161
2162extern __inline void
2163 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2164 _mm_stream_si64(long long int *__A, long long int __B) {
2165 /* Use the data cache block touch for store transient. */
2166 __asm__(" dcbtstt 0,%0" : : "b"(__A) : "memory");
2167 *__A = __B;
2168}
2169
2170extern __inline void
2171 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2172 _mm_stream_si128(__m128i *__A, __m128i __B) {
2173 /* Use the data cache block touch for store transient. */
2174 __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2175 *__A = __B;
2176}
2177
2178extern __inline void
2179 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2180 _mm_stream_pd(double *__A, __m128d __B) {
2181 /* Use the data cache block touch for store transient. */
2182 __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2183 *(__m128d *)__A = __B;
2184}
2185
2186extern __inline void
2187 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2188 _mm_clflush(void const *__A) {
2189 /* Use the data cache block flush. */
2190 __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2191}
2192
/* Load fence: order earlier loads ahead of later loads.

   An acquire-release fence is requested instead of a release-only fence.
   In the C11 / __atomic model a release fence constrains earlier accesses
   only with respect to later stores, so on its own it does not express
   the load-to-load ordering this intrinsic is meant to provide; adding
   the acquire half does, and the release guarantee callers already had is
   retained.
   NOTE(review): on PowerPC this is expected to still select the light
   weight sync, exactly as before -- confirm in the generated code.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering. */
  __atomic_thread_fence(__ATOMIC_ACQ_REL);
}
2199
/* Full memory fence: any-to-any ordering between the memory accesses
   before and after the call, requested as a sequentially consistent
   thread fence.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Sequentially consistent fence, i.e. the heavy weight sync. */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}
2206
2207extern __inline __m128i
2208 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2209 _mm_cvtsi32_si128(int __A) {
2210 return _mm_set_epi32(0, 0, 0, __A);
2211}
2212
2213extern __inline __m128i
2214 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2215 _mm_cvtsi64_si128(long long __A) {
2216 return __extension__(__m128i)(__v2di){__A, 0LL};
2217}
2218
2219/* Microsoft intrinsic. */
2220extern __inline __m128i
2221 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2222 _mm_cvtsi64x_si128(long long __A) {
2223 return __extension__(__m128i)(__v2di){__A, 0LL};
2224}
2225
2226/* Casts between various SP, DP, INT vector types. Note that these do no
2227 conversion of values, they just change the type. */
2228extern __inline __m128
2229 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2230 _mm_castpd_ps(__m128d __A) {
2231 return (__m128)__A;
2232}
2233
2234extern __inline __m128i
2235 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236 _mm_castpd_si128(__m128d __A) {
2237 return (__m128i)__A;
2238}
2239
2240extern __inline __m128d
2241 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2242 _mm_castps_pd(__m128 __A) {
2243 return (__m128d)__A;
2244}
2245
2246extern __inline __m128i
2247 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248 _mm_castps_si128(__m128 __A) {
2249 return (__m128i)__A;
2250}
2251
2252extern __inline __m128
2253 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2254 _mm_castsi128_ps(__m128i __A) {
2255 return (__m128)__A;
2256}
2257
2258extern __inline __m128d
2259 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2260 _mm_castsi128_pd(__m128i __A) {
2261 return (__m128d)__A;
2262}
2263
2264#else
2265#include_next <emmintrin.h>
2266#endif /* defined(__powerpc64__) && \
2267 * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2268
2269#endif /* EMMINTRIN_H_ */
__device__ int
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:10393
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1708
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:10527
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition: altivec.h:5326
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
Definition: altivec.h:10979
#define vec_ctf(__a, __b)
Definition: altivec.h:3244
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
Definition: altivec.h:6263
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ vector signed char __ATTRS_o_ai vec_ld(long __a, const vector signed char *__b)
Definition: altivec.h:4061
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:14737
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition: altivec.h:1235
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, long __b, vector signed char *__c)
Definition: altivec.h:11184
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
Definition: altivec.h:9149
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
Definition: altivec.h:12781
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition: altivec.h:12487
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition: altivec.h:882
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition: altivec.h:1586
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:5361
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:12149
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
Definition: altivec.h:10353
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:626
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7962
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:8588
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition: altivec.h:5091
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2435
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition: altivec.h:4838
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
Definition: altivec.h:9884
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition: altivec.h:6729
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition: altivec.h:2243
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
Definition: altivec.h:7844
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition: altivec.h:5742
#define vec_cts
Definition: altivec.h:3319
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition: altivec.h:10090
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition: altivec.h:6865
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
Definition: altivec.h:12642
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8882
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
Definition: altivec.h:10337
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:13207
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2131
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition: altivec.h:2369
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7715
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:11869
static __inline__ uint32_t volatile uint32_t * __p
Definition: arm_acle.h:80
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1489
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3742
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1044
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4531
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4606
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1953
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3585
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1020
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1805
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4188
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2359
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:585
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:74
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:212
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4740
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:398
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4037
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4263
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2811
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2662
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:820
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1186
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1609
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2559
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a)
Moves the least significant 64 bits of a vector of [2 x i64] to a 64-bit signed integer value.
Definition: emmintrin.h:3410
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:3978
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1162
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3545
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1210
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2154
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1553
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3075
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3002
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1789
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3215
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1823
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2507
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:742
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2736
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:193
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:3997
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3235
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2697
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:519
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:298
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1684
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4641
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:767
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2681
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3020
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2416
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1138
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2866
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:415
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2321
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2258
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1933
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4800
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2984
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4502
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:793
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3155
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:973
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4661
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit signed integer values in the input and returns the di...
Definition: emmintrin.h:2581
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:717
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:669
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4575
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2904
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4679
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
Definition: emmintrin.h:4710
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3113
static __inline__ void int __a
Definition: emmintrin.h:4057
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:153
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit unsigned integer values in the input and returns the...
Definition: emmintrin.h:2645
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4622
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4468
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed truncated (rounded towar...
Definition: emmintrin.h:1470
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3313
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a)
Returns a vector of [2 x i64] where the lower element is the input operand and the upper element is z...
Definition: emmintrin.h:3379
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4554
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3175
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of each of the two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:253
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1876
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4424
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1508
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1337
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3477
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4770
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1234
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2283
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2220
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3364
#define _mm_load_pd1(dp)
Definition: emmintrin.h:1577
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3038
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2757
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter,...
Definition: emmintrin.h:4247
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:606
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1379
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1356
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:114
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1735
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2198
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2176
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 16-bit signed integer values in the input and returns the d...
Definition: emmintrin.h:2603
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2490
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1755
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4447
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1276
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2378
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
Definition: emmintrin.h:4219
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1092
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:564
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2132
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2397
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:381
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:996
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2302
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4725
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2847
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts, with saturation, 16-bit signed integers from both 128-bit integer vector operands into 8-bi...
Definition: emmintrin.h:4142
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3689
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2885
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1593
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1068
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:648
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4755
#define _mm_bsrli_si128(a, imm)
Definition: emmintrin.h:2949
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2009
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2793
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2473
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:947
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:847
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2028
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1914
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3440
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3723
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition: emmintrin.h:4165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2775
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2052
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:922
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:498
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3425
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:692
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4019
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3195
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4785
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1857
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:361
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3497
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
Definition: emmintrin.h:1401
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
Definition: emmintrin.h:2452
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:897
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3056
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:872
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into four signed truncated (rounded toward zero) 32-bit integers,...
Definition: emmintrin.h:3350
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
Definition: emmintrin.h:3796
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1709
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
Definition: emmintrin.h:3634
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3672
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
Definition: emmintrin.h:236
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
Definition: emmintrin.h:1843
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
Definition: emmintrin.h:3518
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
Definition: emmintrin.h:277
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2829
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:92
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1893
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
Definition: emmintrin.h:1569
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four upper 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4364
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns a vecto...
Definition: emmintrin.h:344
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
Definition: emmintrin.h:2340
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4591
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3874
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:171
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
Definition: emmintrin.h:435
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1773
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
Definition: emmintrin.h:1523
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
Definition: emmintrin.h:323
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
Definition: emmintrin.h:1538
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
Definition: emmintrin.h:3460
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3706
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3655
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3394
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2524
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2239
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
Definition: emmintrin.h:2434
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
Definition: emmintrin.h:1992
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
Definition: emmintrin.h:4396
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:2966
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:1970
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1258
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
Definition: emmintrin.h:2073
#define _mm_bslli_si128(a, imm)
Definition: emmintrin.h:2740
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2945
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts, with saturation, corresponding 8-bit unsigned integer values in the input and returns the ...
Definition: emmintrin.h:2624
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector par...
Definition: emmintrin.h:4298
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
Definition: emmintrin.h:1294
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3858
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1116
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:541
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:456
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:3889
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:19
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2923
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
Definition: emmintrin.h:3764
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1450
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2714
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:627
static __inline__ void short __D
Definition: immintrin.h:476
__inline unsigned int unsigned int unsigned int * __P
Definition: bmi2intrin.h:25
__inline unsigned int unsigned int __Y
Definition: bmi2intrin.h:19