xmmintrin.h
1/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file is to help porting code using Intel intrinsics
15 explicitly from x86_64 to powerpc64/powerpc64le.
16
17   Since the X86 SSE intrinsics mainly handle the __m128 type, the
18   PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
19   However, scalar float operations in vector (XMM) registers require
20   the POWER8 VSX ISA (2.07) level. There are differences in the data
21   format and placement of float scalars in the vector register, which
22   require extra steps to match SSE scalar float semantics on POWER.
23
24   Note that there are significant differences between the X86_64
25   MXCSR and the PowerISA FPSCR/VSCR registers. It is recommended to
26   use the portable <fenv.h> interface instead of accessing the MXCSR directly.
27
28 Most SSE scalar float intrinsic operations can be performed more
29 efficiently as C language float scalar operations or optimized to
30 use vector SIMD operations. We recommend this for new applications. */
31#error \
32 "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33#endif
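/* Illustrative note (editor's sketch, not part of the original header):
   ports that intentionally keep the Intel intrinsics can silence the
   error above either with -DNO_WARN_X86_INTRINSICS on the command line
   or by defining the macro before the include, e.g.:

     // hypothetical translation unit built for powerpc64le
     #define NO_WARN_X86_INTRINSICS 1
     #include <xmmintrin.h>
*/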
34
35#ifndef XMMINTRIN_H_
36#define XMMINTRIN_H_
37
38#if defined(__powerpc64__) && \
39 (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40
41/* Define a four-value permute mask. */
42#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
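/* Illustrative example (editor's sketch, not part of the original header):
   _MM_SHUFFLE packs four 2-bit element selectors, most significant first.

     // _MM_SHUFFLE(3, 2, 1, 0) == (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4.
     // With _mm_shuffle_ps (defined later in this header) this selects
     // elements 0 and 1 from __a and elements 2 and 3 from __b.
     __m128 __r = _mm_shuffle_ps(__a, __b, _MM_SHUFFLE(3, 2, 1, 0));
*/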
43
44#include <altivec.h>
45
46/* Avoid collisions between altivec.h and strict adherence to C++ and
47 C11 standards. This should eventually be done inside altivec.h itself,
48 but only after testing a full distro build. */
49#if defined(__STRICT_ANSI__) && \
50 (defined(__cplusplus) || \
51 (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
52#undef vector
53#undef pixel
54#undef bool
55#endif
56
57/* We need type definitions from the MMX header file. */
58#include <mmintrin.h>
59
60/* Get _mm_malloc () and _mm_free (). */
61#if __STDC_HOSTED__
62#include <mm_malloc.h>
63#endif
64
65/* The Intel API is flexible enough that we must allow aliasing with other
66 vector types, and their scalar components. */
67typedef vector float __m128 __attribute__((__may_alias__));
68
69/* Unaligned version of the same type. */
70typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
71
72/* Internal data types for implementing the intrinsics. */
73typedef vector float __v4sf;
74
75/* Create an undefined vector. */
76extern __inline __m128
77 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _mm_undefined_ps(void) {
79 __m128 __Y = __Y;
80 return __Y;
81}
82
83/* Create a vector of zeros. */
84extern __inline __m128
85 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 _mm_setzero_ps(void) {
87 return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
88}
89
90/* Load four SPFP values from P. The address must be 16-byte aligned. */
91extern __inline __m128
92 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_load_ps(float const *__P) {
94 return ((__m128)vec_ld(0, (__v4sf *)__P));
95}
96
97/* Load four SPFP values from P. The address need not be 16-byte aligned. */
98extern __inline __m128
99 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 _mm_loadu_ps(float const *__P) {
101 return (vec_vsx_ld(0, __P));
102}
103
104/* Load four SPFP values in reverse order. The address must be aligned. */
105extern __inline __m128
106 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107 _mm_loadr_ps(float const *__P) {
108 __v4sf __tmp;
109 __m128 __result;
110 static const __vector unsigned char __permute_vector = {
111 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
112 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
113
114 __tmp = vec_ld(0, (__v4sf *)__P);
115 __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
116 return __result;
117}
118
119/* Create a vector with all four elements equal to F. */
120extern __inline __m128
121 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
122 _mm_set1_ps(float __F) {
123 return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
124}
125
126extern __inline __m128
127 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128 _mm_set_ps1(float __F) {
129 return _mm_set1_ps(__F);
130}
131
132/* Create the vector [Z Y X W]. */
133extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
134 __artificial__))
135_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
136 return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
137}
138
139/* Create the vector [W X Y Z]. */
140extern __inline __m128
141 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
143 return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
144}
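/* Illustrative example (editor's sketch, not part of the original header):
   _mm_set_ps takes its arguments from the highest element down, while
   _mm_setr_ps takes them in memory (element) order, so both calls below
   build the same vector {1.0f, 2.0f, 3.0f, 4.0f} with element 0 first:

     __m128 __v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
     __m128 __v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
*/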
145
146/* Store four SPFP values. The address must be 16-byte aligned. */
147extern __inline void
148 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149 _mm_store_ps(float *__P, __m128 __A) {
150 vec_st((__v4sf)__A, 0, (__v4sf *)__P);
151}
152
153/* Store four SPFP values. The address need not be 16-byte aligned. */
154extern __inline void
155 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156 _mm_storeu_ps(float *__P, __m128 __A) {
157 *(__m128_u *)__P = __A;
158}
159
160/* Store four SPFP values in reverse order. The address must be aligned. */
161extern __inline void
162 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163 _mm_storer_ps(float *__P, __m128 __A) {
164 __v4sf __tmp;
165 static const __vector unsigned char __permute_vector = {
166 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
167 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
168
169 __tmp = (__m128)vec_perm(__A, __A, __permute_vector);
170
171 _mm_store_ps(__P, __tmp);
172}
173
174/* Store the lower SPFP value across four words. */
175extern __inline void
176 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_store1_ps(float *__P, __m128 __A) {
178 __v4sf __va = vec_splat((__v4sf)__A, 0);
179 _mm_store_ps(__P, __va);
180}
181
182extern __inline void
183 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184 _mm_store_ps1(float *__P, __m128 __A) {
185 _mm_store1_ps(__P, __A);
186}
187
188/* Create a vector with element 0 as F and the rest zero. */
189extern __inline __m128
190 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 _mm_set_ss(float __F) {
192 return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
193}
194
195/* Sets the low SPFP value of A from the low value of B. */
196extern __inline __m128
197 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198 _mm_move_ss(__m128 __A, __m128 __B) {
199 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
200
201 return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
202}
203
204/* Create a vector with element 0 as *P and the rest zero. */
205extern __inline __m128
206 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207 _mm_load_ss(float const *__P) {
208 return _mm_set_ss(*__P);
209}
210
211/* Stores the lower SPFP value. */
212extern __inline void
213 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214 _mm_store_ss(float *__P, __m128 __A) {
215 *__P = ((__v4sf)__A)[0];
216}
217
218/* Perform the respective operation on the lower SPFP (single-precision
219 floating-point) values of A and B; the upper three SPFP values are
220 passed through from A. */
221
222extern __inline __m128
223 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
224 _mm_add_ss(__m128 __A, __m128 __B) {
225#ifdef _ARCH_PWR7
226 __m128 __a, __b, __c;
227 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
228  /* PowerISA VSX does not allow partial (for just the lower float)
229     results. So to ensure we don't generate spurious exceptions
230     (from the upper float values) we splat the lower float
231     before we do the operation. */
232 __a = vec_splat(__A, 0);
233 __b = vec_splat(__B, 0);
234 __c = __a + __b;
235 /* Then we merge the lower float result with the original upper
236 float elements from __A. */
237 return (vec_sel(__A, __c, __mask));
238#else
239 __A[0] = __A[0] + __B[0];
240 return (__A);
241#endif
242}
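/* Illustrative example (editor's sketch, not part of the original header):
   the *_ss operations only touch element 0; the remaining elements are
   passed through unchanged from the first operand:

     __m128 __a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
     __m128 __r = _mm_add_ss(__a, __b); // {11.0f, 2.0f, 3.0f, 4.0f}
*/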
243
244extern __inline __m128
245 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
246 _mm_sub_ss(__m128 __A, __m128 __B) {
247#ifdef _ARCH_PWR7
248 __m128 __a, __b, __c;
249 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
250  /* PowerISA VSX does not allow partial (for just the lower float)
251     results. So to ensure we don't generate spurious exceptions
252     (from the upper float values) we splat the lower float
253     before we do the operation. */
254 __a = vec_splat(__A, 0);
255 __b = vec_splat(__B, 0);
256 __c = __a - __b;
257 /* Then we merge the lower float result with the original upper
258 float elements from __A. */
259 return (vec_sel(__A, __c, __mask));
260#else
261 __A[0] = __A[0] - __B[0];
262 return (__A);
263#endif
264}
265
266extern __inline __m128
267 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 _mm_mul_ss(__m128 __A, __m128 __B) {
269#ifdef _ARCH_PWR7
270 __m128 __a, __b, __c;
271 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
272  /* PowerISA VSX does not allow partial (for just the lower float)
273     results. So to ensure we don't generate spurious exceptions
274     (from the upper float values) we splat the lower float
275     before we do the operation. */
276 __a = vec_splat(__A, 0);
277 __b = vec_splat(__B, 0);
278 __c = __a * __b;
279 /* Then we merge the lower float result with the original upper
280 float elements from __A. */
281 return (vec_sel(__A, __c, __mask));
282#else
283 __A[0] = __A[0] * __B[0];
284 return (__A);
285#endif
286}
287
288extern __inline __m128
289 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290 _mm_div_ss(__m128 __A, __m128 __B) {
291#ifdef _ARCH_PWR7
292 __m128 __a, __b, __c;
293 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
294  /* PowerISA VSX does not allow partial (for just the lower float)
295     results. So to ensure we don't generate spurious exceptions
296     (from the upper float values) we splat the lower float
297     before we do the operation. */
298 __a = vec_splat(__A, 0);
299 __b = vec_splat(__B, 0);
300 __c = __a / __b;
301 /* Then we merge the lower float result with the original upper
302 float elements from __A. */
303 return (vec_sel(__A, __c, __mask));
304#else
305 __A[0] = __A[0] / __B[0];
306 return (__A);
307#endif
308}
309
310extern __inline __m128
311 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312 _mm_sqrt_ss(__m128 __A) {
313 __m128 __a, __c;
314 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
315  /* PowerISA VSX does not allow partial (for just the lower float)
316   * results. So to ensure we don't generate spurious exceptions
317   * (from the upper float values) we splat the lower float
318   * before we do the operation. */
319 __a = vec_splat(__A, 0);
320 __c = vec_sqrt(__a);
321 /* Then we merge the lower float result with the original upper
322 * float elements from __A. */
323 return (vec_sel(__A, __c, __mask));
324}
325
326/* Perform the respective operation on the four SPFP values in A and B. */
327extern __inline __m128
328 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329 _mm_add_ps(__m128 __A, __m128 __B) {
330 return (__m128)((__v4sf)__A + (__v4sf)__B);
331}
332
333extern __inline __m128
334 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_sub_ps(__m128 __A, __m128 __B) {
336 return (__m128)((__v4sf)__A - (__v4sf)__B);
337}
338
339extern __inline __m128
340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341 _mm_mul_ps(__m128 __A, __m128 __B) {
342 return (__m128)((__v4sf)__A * (__v4sf)__B);
343}
344
345extern __inline __m128
346 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347 _mm_div_ps(__m128 __A, __m128 __B) {
348 return (__m128)((__v4sf)__A / (__v4sf)__B);
349}
350
351extern __inline __m128
352 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353 _mm_sqrt_ps(__m128 __A) {
354 return (vec_sqrt((__v4sf)__A));
355}
356
357extern __inline __m128
358 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
359 _mm_rcp_ps(__m128 __A) {
360 return (vec_re((__v4sf)__A));
361}
362
363extern __inline __m128
364 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365 _mm_rsqrt_ps(__m128 __A) {
366 return (vec_rsqrte(__A));
367}
368
369extern __inline __m128
370 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371 _mm_rcp_ss(__m128 __A) {
372 __m128 __a, __c;
373 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
374  /* PowerISA VSX does not allow partial (for just the lower float)
375   * results. So to ensure we don't generate spurious exceptions
376   * (from the upper float values) we splat the lower float
377   * before we do the operation. */
378 __a = vec_splat(__A, 0);
379 __c = _mm_rcp_ps(__a);
380 /* Then we merge the lower float result with the original upper
381 * float elements from __A. */
382 return (vec_sel(__A, __c, __mask));
383}
384
385extern __inline __m128
386 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387 _mm_rsqrt_ss(__m128 __A) {
388 __m128 __a, __c;
389 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
390  /* PowerISA VSX does not allow partial (for just the lower float)
391   * results. So to ensure we don't generate spurious exceptions
392   * (from the upper float values) we splat the lower float
393   * before we do the operation. */
394 __a = vec_splat(__A, 0);
395 __c = vec_rsqrte(__a);
396 /* Then we merge the lower float result with the original upper
397 * float elements from __A. */
398 return (vec_sel(__A, __c, __mask));
399}
400
401extern __inline __m128
402 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
403 _mm_min_ss(__m128 __A, __m128 __B) {
404 __v4sf __a, __b, __c;
405 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
406 /* PowerISA VSX does not allow partial (for just lower float)
407   * results. So to ensure we don't generate spurious exceptions
408   * (from the upper float values) we splat the lower float
409   * before we do the operation. */
410 __a = vec_splat((__v4sf)__A, 0);
411 __b = vec_splat((__v4sf)__B, 0);
412 __c = vec_min(__a, __b);
413 /* Then we merge the lower float result with the original upper
414 * float elements from __A. */
415 return (vec_sel((__v4sf)__A, __c, __mask));
416}
417
418extern __inline __m128
419 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420 _mm_max_ss(__m128 __A, __m128 __B) {
421 __v4sf __a, __b, __c;
422 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
423 /* PowerISA VSX does not allow partial (for just lower float)
424   * results. So to ensure we don't generate spurious exceptions
425   * (from the upper float values) we splat the lower float
426   * before we do the operation. */
427 __a = vec_splat(__A, 0);
428 __b = vec_splat(__B, 0);
429 __c = vec_max(__a, __b);
430 /* Then we merge the lower float result with the original upper
431 * float elements from __A. */
432 return (vec_sel((__v4sf)__A, __c, __mask));
433}
434
435extern __inline __m128
436 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437 _mm_min_ps(__m128 __A, __m128 __B) {
438 __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
439 return vec_sel(__B, __A, __m);
440}
441
442extern __inline __m128
443 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
444 _mm_max_ps(__m128 __A, __m128 __B) {
445 __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
446 return vec_sel(__B, __A, __m);
447}
448
449/* Perform logical bit-wise operations on 128-bit values. */
450extern __inline __m128
451 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452 _mm_and_ps(__m128 __A, __m128 __B) {
453 return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
454 // return __builtin_ia32_andps (__A, __B);
455}
456
457extern __inline __m128
458 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459 _mm_andnot_ps(__m128 __A, __m128 __B) {
460 return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
461}
462
463extern __inline __m128
464 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_or_ps(__m128 __A, __m128 __B) {
466 return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
467}
468
469extern __inline __m128
470 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471 _mm_xor_ps(__m128 __A, __m128 __B) {
472 return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
473}
474
475/* Perform a comparison on the four SPFP values of A and B. For each
476 element, if the comparison is true, place a mask of all ones in the
477 result, otherwise a mask of zeros. */
478extern __inline __m128
479 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_cmpeq_ps(__m128 __A, __m128 __B) {
481 return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
482}
483
484extern __inline __m128
485 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_cmplt_ps(__m128 __A, __m128 __B) {
487 return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
488}
489
490extern __inline __m128
491 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_cmple_ps(__m128 __A, __m128 __B) {
493 return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
494}
495
496extern __inline __m128
497 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498 _mm_cmpgt_ps(__m128 __A, __m128 __B) {
499 return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
500}
501
502extern __inline __m128
503 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504 _mm_cmpge_ps(__m128 __A, __m128 __B) {
505 return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
506}
507
508extern __inline __m128
509 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510 _mm_cmpneq_ps(__m128 __A, __m128 __B) {
511 __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
512 return ((__m128)vec_nor(__temp, __temp));
513}
514
515extern __inline __m128
516 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
518 return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
519}
520
521extern __inline __m128
522 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523 _mm_cmpnle_ps(__m128 __A, __m128 __B) {
524 return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
525}
526
527extern __inline __m128
528 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpngt_ps(__m128 __A, __m128 __B) {
530 return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
531}
532
533extern __inline __m128
534 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535 _mm_cmpnge_ps(__m128 __A, __m128 __B) {
536 return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
537}
538
539extern __inline __m128
540 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541 _mm_cmpord_ps(__m128 __A, __m128 __B) {
542 __vector unsigned int __a, __b;
543 __vector unsigned int __c, __d;
544 static const __vector unsigned int __float_exp_mask = {
545 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
546
547 __a = (__vector unsigned int)vec_abs((__v4sf)__A);
548 __b = (__vector unsigned int)vec_abs((__v4sf)__B);
549 __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
550 __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
551 return ((__m128)vec_and(__c, __d));
552}
553
554extern __inline __m128
555 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_cmpunord_ps(__m128 __A, __m128 __B) {
557 __vector unsigned int __a, __b;
558 __vector unsigned int __c, __d;
559 static const __vector unsigned int __float_exp_mask = {
560 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
561
562 __a = (__vector unsigned int)vec_abs((__v4sf)__A);
563 __b = (__vector unsigned int)vec_abs((__v4sf)__B);
564 __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
565 __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
566 return ((__m128)vec_or(__c, __d));
567}
568
569/* Perform a comparison on the lower SPFP values of A and B. If the
570 comparison is true, place a mask of all ones in the result, otherwise a
571 mask of zeros. The upper three SPFP values are passed through from A. */
572extern __inline __m128
573 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574 _mm_cmpeq_ss(__m128 __A, __m128 __B) {
575 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
576 __v4sf __a, __b, __c;
577 /* PowerISA VMX does not allow partial (for just element 0)
578   * results. So to ensure we don't generate spurious exceptions
579   * (from the upper elements) we splat the lower float
580   * before we do the operation. */
581 __a = vec_splat((__v4sf)__A, 0);
582 __b = vec_splat((__v4sf)__B, 0);
583 __c = (__v4sf)vec_cmpeq(__a, __b);
584 /* Then we merge the lower float result with the original upper
585 * float elements from __A. */
586 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
587}
588
589extern __inline __m128
590 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591 _mm_cmplt_ss(__m128 __A, __m128 __B) {
592 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
593 __v4sf __a, __b, __c;
594 /* PowerISA VMX does not allow partial (for just element 0)
595   * results. So to ensure we don't generate spurious exceptions
596   * (from the upper elements) we splat the lower float
597   * before we do the operation. */
598 __a = vec_splat((__v4sf)__A, 0);
599 __b = vec_splat((__v4sf)__B, 0);
600 __c = (__v4sf)vec_cmplt(__a, __b);
601 /* Then we merge the lower float result with the original upper
602 * float elements from __A. */
603 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
604}
605
606extern __inline __m128
607 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608 _mm_cmple_ss(__m128 __A, __m128 __B) {
609 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
610 __v4sf __a, __b, __c;
611 /* PowerISA VMX does not allow partial (for just element 0)
612   * results. So to ensure we don't generate spurious exceptions
613   * (from the upper elements) we splat the lower float
614   * before we do the operation. */
615 __a = vec_splat((__v4sf)__A, 0);
616 __b = vec_splat((__v4sf)__B, 0);
617 __c = (__v4sf)vec_cmple(__a, __b);
618 /* Then we merge the lower float result with the original upper
619 * float elements from __A. */
620 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
621}
622
623extern __inline __m128
624 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_cmpgt_ss(__m128 __A, __m128 __B) {
626 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
627 __v4sf __a, __b, __c;
628 /* PowerISA VMX does not allow partial (for just element 0)
629   * results. So to ensure we don't generate spurious exceptions
630   * (from the upper elements) we splat the lower float
631   * before we do the operation. */
632 __a = vec_splat((__v4sf)__A, 0);
633 __b = vec_splat((__v4sf)__B, 0);
634 __c = (__v4sf)vec_cmpgt(__a, __b);
635 /* Then we merge the lower float result with the original upper
636 * float elements from __A. */
637 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
638}
639
640extern __inline __m128
641 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642 _mm_cmpge_ss(__m128 __A, __m128 __B) {
643 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
644 __v4sf __a, __b, __c;
645 /* PowerISA VMX does not allow partial (for just element 0)
646   * results. So to ensure we don't generate spurious exceptions
647   * (from the upper elements) we splat the lower float
648   * before we do the operation. */
649 __a = vec_splat((__v4sf)__A, 0);
650 __b = vec_splat((__v4sf)__B, 0);
651 __c = (__v4sf)vec_cmpge(__a, __b);
652 /* Then we merge the lower float result with the original upper
653 * float elements from __A. */
654 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
655}
656
657extern __inline __m128
658 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659 _mm_cmpneq_ss(__m128 __A, __m128 __B) {
660 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
661 __v4sf __a, __b, __c;
662 /* PowerISA VMX does not allow partial (for just element 0)
663   * results. So to ensure we don't generate spurious exceptions
664   * (from the upper elements) we splat the lower float
665   * before we do the operation. */
666 __a = vec_splat((__v4sf)__A, 0);
667 __b = vec_splat((__v4sf)__B, 0);
668 __c = (__v4sf)vec_cmpeq(__a, __b);
669 __c = vec_nor(__c, __c);
670 /* Then we merge the lower float result with the original upper
671 * float elements from __A. */
672 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
673}
674
675extern __inline __m128
676 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677 _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
678 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
679 __v4sf __a, __b, __c;
680 /* PowerISA VMX does not allow partial (for just element 0)
681   * results. So to ensure we don't generate spurious exceptions
682   * (from the upper elements) we splat the lower float
683   * before we do the operation. */
684 __a = vec_splat((__v4sf)__A, 0);
685 __b = vec_splat((__v4sf)__B, 0);
686 __c = (__v4sf)vec_cmpge(__a, __b);
687 /* Then we merge the lower float result with the original upper
688 * float elements from __A. */
689 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
690}
691
692extern __inline __m128
693 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694 _mm_cmpnle_ss(__m128 __A, __m128 __B) {
695 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
696 __v4sf __a, __b, __c;
697 /* PowerISA VMX does not allow partial (for just element 0)
698   * results. So to ensure we don't generate spurious exceptions
699   * (from the upper elements) we splat the lower float
700   * before we do the operation. */
701 __a = vec_splat((__v4sf)__A, 0);
702 __b = vec_splat((__v4sf)__B, 0);
703 __c = (__v4sf)vec_cmpgt(__a, __b);
704 /* Then we merge the lower float result with the original upper
705 * float elements from __A. */
706 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
707}
708
709extern __inline __m128
710 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711 _mm_cmpngt_ss(__m128 __A, __m128 __B) {
712 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
713 __v4sf __a, __b, __c;
714 /* PowerISA VMX does not allow partial (for just element 0)
715   * results. So to ensure we don't generate spurious exceptions
716   * (from the upper elements) we splat the lower float
717   * before we do the operation. */
718 __a = vec_splat((__v4sf)__A, 0);
719 __b = vec_splat((__v4sf)__B, 0);
720 __c = (__v4sf)vec_cmple(__a, __b);
721 /* Then we merge the lower float result with the original upper
722 * float elements from __A. */
723 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
724}
725
726extern __inline __m128
727 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728 _mm_cmpnge_ss(__m128 __A, __m128 __B) {
729 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
730 __v4sf __a, __b, __c;
731 /* PowerISA VMX does not allow partial (for just element 0)
732   * results. So to ensure we don't generate spurious exceptions
733 * (from the upper elements) we splat the lower float
734 * before we do the operation. */
735 __a = vec_splat((__v4sf)__A, 0);
736 __b = vec_splat((__v4sf)__B, 0);
737 __c = (__v4sf)vec_cmplt(__a, __b);
738 /* Then we merge the lower float result with the original upper
739 * float elements from __A. */
740 return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
741}
742
743extern __inline __m128
744 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745 _mm_cmpord_ss(__m128 __A, __m128 __B) {
746 __vector unsigned int __a, __b;
747 __vector unsigned int __c, __d;
748 static const __vector unsigned int __float_exp_mask = {
749 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
750 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
751
752 __a = (__vector unsigned int)vec_abs((__v4sf)__A);
753 __b = (__vector unsigned int)vec_abs((__v4sf)__B);
754 __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
755 __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
756 __c = vec_and(__c, __d);
757 /* Then we merge the lower float result with the original upper
758 * float elements from __A. */
759 return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
760}
761
762extern __inline __m128
763 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_cmpunord_ss(__m128 __A, __m128 __B) {
765 __vector unsigned int __a, __b;
766 __vector unsigned int __c, __d;
767 static const __vector unsigned int __float_exp_mask = {
768 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
769 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
770
771 __a = (__vector unsigned int)vec_abs((__v4sf)__A);
772 __b = (__vector unsigned int)vec_abs((__v4sf)__B);
773 __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
774 __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
775 __c = vec_or(__c, __d);
776 /* Then we merge the lower float result with the original upper
777 * float elements from __A. */
778 return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
779}
780
781/* Compare the lower SPFP values of A and B and return 1 if true
782 and 0 if false. */
783extern __inline int
784 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785 _mm_comieq_ss(__m128 __A, __m128 __B) {
786 return (__A[0] == __B[0]);
787}
788
789extern __inline int
790 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791 _mm_comilt_ss(__m128 __A, __m128 __B) {
792 return (__A[0] < __B[0]);
793}
794
795extern __inline int
796 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 _mm_comile_ss(__m128 __A, __m128 __B) {
798 return (__A[0] <= __B[0]);
799}
800
801extern __inline int
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _mm_comigt_ss(__m128 __A, __m128 __B) {
804 return (__A[0] > __B[0]);
805}
806
807extern __inline int
808 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809 _mm_comige_ss(__m128 __A, __m128 __B) {
810 return (__A[0] >= __B[0]);
811}
812
813extern __inline int
814 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815 _mm_comineq_ss(__m128 __A, __m128 __B) {
816 return (__A[0] != __B[0]);
817}
818
819/* FIXME
820 * The _mm_ucomi??_ss implementations below are exactly the same as
821 * the _mm_comi??_ss implementations because GCC for PowerPC only
822 * generates unordered compares (scalar and vector).
823 * Technically, _mm_comieq_ss et al. should use the ordered compare
824 * and signal on QNaNs.
825 * The _mm_ucomieq_ss et al. are OK as is.
826 */
827extern __inline int
828 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829 _mm_ucomieq_ss(__m128 __A, __m128 __B) {
830 return (__A[0] == __B[0]);
831}
832
833extern __inline int
834 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835 _mm_ucomilt_ss(__m128 __A, __m128 __B) {
836 return (__A[0] < __B[0]);
837}
838
839extern __inline int
840 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 _mm_ucomile_ss(__m128 __A, __m128 __B) {
842 return (__A[0] <= __B[0]);
843}
844
845extern __inline int
846 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847 _mm_ucomigt_ss(__m128 __A, __m128 __B) {
848 return (__A[0] > __B[0]);
849}
850
851extern __inline int
852 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853 _mm_ucomige_ss(__m128 __A, __m128 __B) {
854 return (__A[0] >= __B[0]);
855}
856
857extern __inline int
858 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859 _mm_ucomineq_ss(__m128 __A, __m128 __B) {
860 return (__A[0] != __B[0]);
861}
862
863extern __inline float
864 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _mm_cvtss_f32(__m128 __A) {
866 return ((__v4sf)__A)[0];
867}
868
869/* Convert the lower SPFP value to a 32-bit integer according to the current
870 rounding mode. */
871extern __inline int
872 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873 _mm_cvtss_si32(__m128 __A) {
874 int __res;
875#ifdef _ARCH_PWR8
876 double __dtmp;
877 __asm__(
878#ifdef __LITTLE_ENDIAN__
879 "xxsldwi %x0,%x0,%x0,3;\n"
880#endif
881 "xscvspdp %x2,%x0;\n"
882 "fctiw %2,%2;\n"
883 "mfvsrd %1,%x2;\n"
884 : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
885 :);
886#else
887 __res = __builtin_rint(__A[0]);
888#endif
889 return __res;
890}
891
892extern __inline int
893 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894 _mm_cvt_ss2si(__m128 __A) {
895 return _mm_cvtss_si32(__A);
896}
897
898/* Convert the lower SPFP value to a 64-bit integer according to the
899   current rounding mode. */
900
901/* Intel intrinsic. */
902extern __inline long long
903 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904 _mm_cvtss_si64(__m128 __A) {
905 long long __res;
906#if defined(_ARCH_PWR8) && defined(__powerpc64__)
907 double __dtmp;
908 __asm__(
909#ifdef __LITTLE_ENDIAN__
910 "xxsldwi %x0,%x0,%x0,3;\n"
911#endif
912 "xscvspdp %x2,%x0;\n"
913 "fctid %2,%2;\n"
914 "mfvsrd %1,%x2;\n"
915 : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
916 :);
917#else
918 __res = __builtin_llrint(__A[0]);
919#endif
920 return __res;
921}
922
923/* Microsoft intrinsic. */
924extern __inline long long
925 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926 _mm_cvtss_si64x(__m128 __A) {
927 return _mm_cvtss_si64((__v4sf)__A);
928}
929
930/* Constants for use with _mm_prefetch. */
931enum _mm_hint {
932  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
933 _MM_HINT_ET0 = 7,
934 _MM_HINT_ET1 = 6,
935 _MM_HINT_T0 = 3,
936 _MM_HINT_T1 = 2,
937 _MM_HINT_T2 = 1,
938 _MM_HINT_NTA = 0
939};
940
941/* Loads one cache line from address P to a location "closer" to the
942 processor. The selector I specifies the type of prefetch operation. */
943extern __inline void
944 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945 _mm_prefetch(const void *__P, enum _mm_hint __I) {
946  /* Current PowerPC implementations ignore the hint parameter. */
947 __builtin_prefetch(__P);
948}
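/* Illustrative example (editor's sketch, not part of the original header):
   the hint argument is accepted for source compatibility but, as noted
   above, every hint currently maps to a plain __builtin_prefetch:

     // __data and __i are hypothetical: an array being streamed through
     _mm_prefetch((const void *)&__data[__i + 16], _MM_HINT_T0);
*/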
949
950/* Convert the two lower SPFP values to 32-bit integers according to the
951 current rounding mode. Return the integers in packed form. */
952extern __inline __m64
953 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
954 _mm_cvtps_pi32(__m128 __A) {
955
956 __v4sf __temp, __rounded;
957 __vector unsigned long long __result;
958
959 /* Splat two lower SPFP values to both halves. */
960 __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
961 __rounded = vec_rint(__temp);
962 __result = (__vector unsigned long long)vec_cts(__rounded, 0);
963
964 return (__m64)((__vector long long)__result)[0];
965}
966
967extern __inline __m64
968 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969 _mm_cvt_ps2pi(__m128 __A) {
970 return _mm_cvtps_pi32(__A);
971}
972
973/* Truncate the lower SPFP value to a 32-bit integer. */
974extern __inline int
975 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976 _mm_cvttss_si32(__m128 __A) {
977 /* Extract the lower float element. */
978 float __temp = __A[0];
979 /* truncate to 32-bit integer and return. */
980 return __temp;
981}
982
983extern __inline int
984 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985 _mm_cvtt_ss2si(__m128 __A) {
986 return _mm_cvttss_si32(__A);
987}
988
989/* Intel intrinsic. */
990extern __inline long long
991 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvttss_si64(__m128 __A) {
993 /* Extract the lower float element. */
994 float __temp = __A[0];
995  /* Truncate to a 64-bit integer and return. */
996 return __temp;
997}
998
999/* Microsoft intrinsic. */
1000extern __inline long long
1001 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002 _mm_cvttss_si64x(__m128 __A) {
1003 /* Extract the lower float element. */
1004 float __temp = __A[0];
1005  /* Truncate to a 64-bit integer and return. */
1006 return __temp;
1007}
1008
1009/* Truncate the two lower SPFP values to 32-bit integers. Return the
1010 integers in packed form. */
1011extern __inline __m64
1012 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013 _mm_cvttps_pi32(__m128 __A) {
1014 __v4sf __temp;
1015 __vector unsigned long long __result;
1016
1017 /* Splat two lower SPFP values to both halves. */
1018 __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
1019 __result = (__vector unsigned long long)vec_cts(__temp, 0);
1020
1021 return (__m64)((__vector long long)__result)[0];
1022}
1023
1024extern __inline __m64
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _mm_cvtt_ps2pi(__m128 __A) {
1027 return _mm_cvttps_pi32(__A);
1028}
1029
1030/* Convert B to a SPFP value and insert it as element zero in A. */
1031extern __inline __m128
1032 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1033 _mm_cvtsi32_ss(__m128 __A, int __B) {
1034 float __temp = __B;
1035 __A[0] = __temp;
1036
1037 return __A;
1038}
1039
1040extern __inline __m128
1041 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042 _mm_cvt_si2ss(__m128 __A, int __B) {
1043 return _mm_cvtsi32_ss(__A, __B);
1044}
1045
1046/* Convert B to a SPFP value and insert it as element zero in A. */
1047/* Intel intrinsic. */
1048extern __inline __m128
1049 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050 _mm_cvtsi64_ss(__m128 __A, long long __B) {
1051 float __temp = __B;
1052 __A[0] = __temp;
1053
1054 return __A;
1055}
1056
1057/* Microsoft intrinsic. */
1058extern __inline __m128
1059 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060 _mm_cvtsi64x_ss(__m128 __A, long long __B) {
1061 return _mm_cvtsi64_ss(__A, __B);
1062}
1063
1064/* Convert the two 32-bit values in B to SPFP form and insert them
1065 as the two lower elements in A. */
1066extern __inline __m128
1067 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068 _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
1069 __vector signed int __vm1;
1070 __vector float __vf1;
1071
1072 __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
1073 __vf1 = (__vector float)vec_ctf(__vm1, 0);
1074
1075 return ((__m128)(__vector unsigned long long){
1076 ((__vector unsigned long long)__vf1)[0],
1077 ((__vector unsigned long long)__A)[1]});
1078}
1079
1080extern __inline __m128
1081 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082 _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
1083 return _mm_cvtpi32_ps(__A, __B);
1084}
1085
1086/* Convert the four signed 16-bit values in A to SPFP form. */
1087extern __inline __m128
1088 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1089 _mm_cvtpi16_ps(__m64 __A) {
1090 __vector signed short __vs8;
1091 __vector signed int __vi4;
1092 __vector float __vf1;
1093
1094 __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
1095 __vi4 = vec_vupklsh(__vs8);
1096 __vf1 = (__vector float)vec_ctf(__vi4, 0);
1097
1098 return (__m128)__vf1;
1099}
1100
1101/* Convert the four unsigned 16-bit values in A to SPFP form. */
1102extern __inline __m128
1103 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104 _mm_cvtpu16_ps(__m64 __A) {
1105 const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1106 __vector unsigned short __vs8;
1107 __vector unsigned int __vi4;
1108 __vector float __vf1;
1109
1110 __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
1111 __vi4 = (__vector unsigned int)vec_mergel
1112#ifdef __LITTLE_ENDIAN__
1113 (__vs8, __zero);
1114#else
1115 (__zero, __vs8);
1116#endif
1117 __vf1 = (__vector float)vec_ctf(__vi4, 0);
1118
1119 return (__m128)__vf1;
1120}
1121
1122/* Convert the low four signed 8-bit values in A to SPFP form. */
1123extern __inline __m128
1124 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1125 _mm_cvtpi8_ps(__m64 __A) {
1126 __vector signed char __vc16;
1127 __vector signed short __vs8;
1128 __vector signed int __vi4;
1129 __vector float __vf1;
1130
1131 __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
1132 __vs8 = vec_vupkhsb(__vc16);
1133 __vi4 = vec_vupkhsh(__vs8);
1134 __vf1 = (__vector float)vec_ctf(__vi4, 0);
1135
1136 return (__m128)__vf1;
1137}
1138
1139/* Convert the low four unsigned 8-bit values in A to SPFP form. */
1140extern __inline __m128
1141 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142
1143 _mm_cvtpu8_ps(__m64 __A) {
1144 const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1145 __vector unsigned char __vc16;
1146 __vector unsigned short __vs8;
1147 __vector unsigned int __vi4;
1148 __vector float __vf1;
1149
1150 __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
1151#ifdef __LITTLE_ENDIAN__
1152 __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
1153 __vi4 =
1154 (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
1155#else
1156 __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
1157 __vi4 =
1158 (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
1159#endif
1160 __vf1 = (__vector float)vec_ctf(__vi4, 0);
1161
1162 return (__m128)__vf1;
1163}
1164
1165/* Convert the four signed 32-bit values in A and B to SPFP form. */
1166extern __inline __m128
1167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
1169 __vector signed int __vi4;
1170 __vector float __vf4;
1171
1172 __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
1173 __vf4 = (__vector float)vec_ctf(__vi4, 0);
1174 return (__m128)__vf4;
1175}
1176
1177/* Convert the four SPFP values in A to four signed 16-bit integers. */
1178extern __inline __m64
1179 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180 _mm_cvtps_pi16(__m128 __A) {
1181 __v4sf __rounded;
1182 __vector signed int __temp;
1183 __vector unsigned long long __result;
1184
1185 __rounded = vec_rint(__A);
1186 __temp = vec_cts(__rounded, 0);
1187 __result = (__vector unsigned long long)vec_pack(__temp, __temp);
1188
1189 return (__m64)((__vector long long)__result)[0];
1190}
1191
1192/* Convert the four SPFP values in A to four signed 8-bit integers. */
1193extern __inline __m64
1194 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195 _mm_cvtps_pi8(__m128 __A) {
1196 __v4sf __rounded;
1197 __vector signed int __tmp_i;
1198 static const __vector signed int __zero = {0, 0, 0, 0};
1199 __vector signed short __tmp_s;
1200 __vector signed char __res_v;
1201
1202 __rounded = vec_rint(__A);
1203 __tmp_i = vec_cts(__rounded, 0);
1204 __tmp_s = vec_pack(__tmp_i, __zero);
1205 __res_v = vec_pack(__tmp_s, __tmp_s);
1206 return (__m64)((__vector long long)__res_v)[0];
1207}
1208
1209/* Selects four specific SPFP values from A and B based on MASK. */
1210extern __inline __m128
1211 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212
1213 _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
1214 unsigned long __element_selector_10 = __mask & 0x03;
1215 unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
1216 unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
1217 unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
1218 static const unsigned int __permute_selectors[4] = {
1219#ifdef __LITTLE_ENDIAN__
1220 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1221#else
1222 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1223#endif
1224 };
1225 __vector unsigned int __t;
1226
1227 __t[0] = __permute_selectors[__element_selector_10];
1228 __t[1] = __permute_selectors[__element_selector_32];
1229 __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
1230 __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
1231 return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
1232}
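/* Illustrative example (editor's sketch, not part of the original header):
   the low two result elements are selected from __A and the high two
   from __B; passing the same vector twice reverses it here:

     __m128 __v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __r = _mm_shuffle_ps(__v, __v, _MM_SHUFFLE(0, 1, 2, 3));
     // __r is {4.0f, 3.0f, 2.0f, 1.0f}
*/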
1233
1234/* Selects and interleaves the upper two SPFP values from A and B. */
1235extern __inline __m128
1236 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237 _mm_unpackhi_ps(__m128 __A, __m128 __B) {
1238 return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
1239}
1240
1241/* Selects and interleaves the lower two SPFP values from A and B. */
1242extern __inline __m128
1243 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244 _mm_unpacklo_ps(__m128 __A, __m128 __B) {
1245 return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
1246}
1247
1248/* Sets the upper two SPFP values with 64-bits of data loaded from P;
1249 the lower two values are passed through from A. */
1250extern __inline __m128
1251 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1252 _mm_loadh_pi(__m128 __A, __m64 const *__P) {
1253 __vector unsigned long long __a = (__vector unsigned long long)__A;
1254 __vector unsigned long long __p = vec_splats(*__P);
1255 __a[1] = __p[1];
1256
1257 return (__m128)__a;
1258}
1259
1260/* Stores the upper two SPFP values of A into P. */
1261extern __inline void
1262 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263 _mm_storeh_pi(__m64 *__P, __m128 __A) {
1264 __vector unsigned long long __a = (__vector unsigned long long)__A;
1265
1266 *__P = __a[1];
1267}
1268
1269/* Moves the upper two values of B into the lower two values of A. */
1270extern __inline __m128
1271 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272 _mm_movehl_ps(__m128 __A, __m128 __B) {
1273 return (__m128)vec_mergel((__vector unsigned long long)__B,
1274 (__vector unsigned long long)__A);
1275}
1276
1277/* Moves the lower two values of B into the upper two values of A. */
1278extern __inline __m128
1279 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_movelh_ps(__m128 __A, __m128 __B) {
1281 return (__m128)vec_mergeh((__vector unsigned long long)__A,
1282 (__vector unsigned long long)__B);
1283}
1284
1285/* Sets the lower two SPFP values with 64-bits of data loaded from P;
1286 the upper two values are passed through from A. */
1287extern __inline __m128
1288 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm_loadl_pi(__m128 __A, __m64 const *__P) {
1290 __vector unsigned long long __a = (__vector unsigned long long)__A;
1291 __vector unsigned long long __p = vec_splats(*__P);
1292 __a[0] = __p[0];
1293
1294 return (__m128)__a;
1295}
1296
1297/* Stores the lower two SPFP values of A into P. */
1298extern __inline void
1299 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300 _mm_storel_pi(__m64 *__P, __m128 __A) {
1301 __vector unsigned long long __a = (__vector unsigned long long)__A;
1302
1303 *__P = __a[0];
1304}
1305
1306#ifdef _ARCH_PWR8
1307/* Intrinsic functions that require PowerISA 2.07 minimum. */
1308
1309/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1310extern __inline int
1311 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312 _mm_movemask_ps(__m128 __A) {
1313#ifdef _ARCH_PWR10
1314 return vec_extractm((__vector unsigned int)__A);
1315#else
1316 __vector unsigned long long __result;
1317 static const __vector unsigned int __perm_mask = {
1318#ifdef __LITTLE_ENDIAN__
1319 0x00204060, 0x80808080, 0x80808080, 0x80808080
1320#else
1321 0x80808080, 0x80808080, 0x80808080, 0x00204060
1322#endif
1323 };
1324
1325 __result = ((__vector unsigned long long)vec_vbpermq(
1326 (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1327
1328#ifdef __LITTLE_ENDIAN__
1329 return __result[1];
1330#else
1331 return __result[0];
1332#endif
1333#endif /* !_ARCH_PWR10 */
1334}
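/* Illustrative example (editor's sketch, not part of the original header):
   bit i of the result is the sign bit of element i:

     __m128 __v = _mm_setr_ps(-1.0f, 2.0f, -3.0f, 4.0f);
     int __m = _mm_movemask_ps(__v); // 0x5: elements 0 and 2 are negative
*/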
1335#endif /* _ARCH_PWR8 */
1336
1337/* Create a vector with all four elements equal to *P. */
1338extern __inline __m128
1339 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340 _mm_load1_ps(float const *__P) {
1341 return _mm_set1_ps(*__P);
1342}
1343
1344extern __inline __m128
1345 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346 _mm_load_ps1(float const *__P) {
1347 return _mm_load1_ps(__P);
1348}
1349
1350/* Extracts one of the four words of A. The selector N must be immediate. */
1351extern __inline int
1352 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353 _mm_extract_pi16(__m64 const __A, int const __N) {
1354 unsigned int __shiftr = __N & 3;
1355#ifdef __BIG_ENDIAN__
1356 __shiftr = 3 - __shiftr;
1357#endif
1358
1359 return ((__A >> (__shiftr * 16)) & 0xffff);
1360}
1361
1362extern __inline int
1363 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364 _m_pextrw(__m64 const __A, int const __N) {
1365 return _mm_extract_pi16(__A, __N);
1366}
1367
1368/* Inserts word D into one of four words of A. The selector N must be
1369 immediate. */
1370extern __inline __m64
1371 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372 _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
1373 const int __shiftl = (__N & 3) * 16;
1374 const __m64 __shiftD = (const __m64)__D << __shiftl;
1375 const __m64 __mask = 0xffffUL << __shiftl;
1376 __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
1377
1378 return __result;
1379}
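/* Illustrative example (editor's sketch, not part of the original header):
   extract and insert address one 16-bit lane of an __m64 by index:

     __m64 __w = _mm_setr_pi16(10, 20, 30, 40); // from mmintrin.h
     int   __x = _mm_extract_pi16(__w, 2);      // 30
     __m64 __y = _mm_insert_pi16(__w, 99, 0);   // {99, 20, 30, 40}
*/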
1380
1381extern __inline __m64
1382 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383 _m_pinsrw(__m64 const __A, int const __D, int const __N) {
1384 return _mm_insert_pi16(__A, __D, __N);
1385}
1386
1387/* Compute the element-wise maximum of signed 16-bit values. */
1388extern __inline __m64
1389 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1390
1391 _mm_max_pi16(__m64 __A, __m64 __B) {
1392#if _ARCH_PWR8
1393 __vector signed short __a, __b, __r;
1394 __vector __bool short __c;
1395
1396 __a = (__vector signed short)vec_splats(__A);
1397 __b = (__vector signed short)vec_splats(__B);
1398 __c = (__vector __bool short)vec_cmpgt(__a, __b);
1399 __r = vec_sel(__b, __a, __c);
1400 return (__m64)((__vector long long)__r)[0];
1401#else
1402 __m64_union __m1, __m2, __res;
1403
1404 __m1.as_m64 = __A;
1405 __m2.as_m64 = __B;
1406
1407 __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
1408 : __m2.as_short[0];
1409 __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
1410 : __m2.as_short[1];
1411 __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
1412 : __m2.as_short[2];
1413 __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
1414 : __m2.as_short[3];
1415
1416 return (__m64)__res.as_m64;
1417#endif
1418}
1419
1420extern __inline __m64
1421 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1422 _m_pmaxsw(__m64 __A, __m64 __B) {
1423 return _mm_max_pi16(__A, __B);
1424}
1425
1426/* Compute the element-wise maximum of unsigned 8-bit values. */
1427extern __inline __m64
1428 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429 _mm_max_pu8(__m64 __A, __m64 __B) {
1430#if _ARCH_PWR8
1431 __vector unsigned char __a, __b, __r;
1432 __vector __bool char __c;
1433
1434 __a = (__vector unsigned char)vec_splats(__A);
1435 __b = (__vector unsigned char)vec_splats(__B);
1436 __c = (__vector __bool char)vec_cmpgt(__a, __b);
1437 __r = vec_sel(__b, __a, __c);
1438 return (__m64)((__vector long long)__r)[0];
1439#else
1440 __m64_union __m1, __m2, __res;
1441 long __i;
1442
1443 __m1.as_m64 = __A;
1444 __m2.as_m64 = __B;
1445
1446 for (__i = 0; __i < 8; __i++)
1447 __res.as_char[__i] =
1448 ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
1449 ? __m1.as_char[__i]
1450 : __m2.as_char[__i];
1451
1452 return (__m64)__res.as_m64;
1453#endif
1454}
1455
1456extern __inline __m64
1457 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458 _m_pmaxub(__m64 __A, __m64 __B) {
1459 return _mm_max_pu8(__A, __B);
1460}
1461
1462/* Compute the element-wise minimum of signed 16-bit values. */
1463extern __inline __m64
1464 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465 _mm_min_pi16(__m64 __A, __m64 __B) {
1466#if _ARCH_PWR8
1467 __vector signed short __a, __b, __r;
1468 __vector __bool short __c;
1469
1470 __a = (__vector signed short)vec_splats(__A);
1471 __b = (__vector signed short)vec_splats(__B);
1472 __c = (__vector __bool short)vec_cmplt(__a, __b);
1473 __r = vec_sel(__b, __a, __c);
1474 return (__m64)((__vector long long)__r)[0];
1475#else
1476 __m64_union __m1, __m2, __res;
1477
1478 __m1.as_m64 = __A;
1479 __m2.as_m64 = __B;
1480
1481 __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
1482 : __m2.as_short[0];
1483 __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
1484 : __m2.as_short[1];
1485 __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
1486 : __m2.as_short[2];
1487 __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
1488 : __m2.as_short[3];
1489
1490 return (__m64)__res.as_m64;
1491#endif
1492}
1493
1494extern __inline __m64
1495 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1496 _m_pminsw(__m64 __A, __m64 __B) {
1497 return _mm_min_pi16(__A, __B);
1498}
1499
1500/* Compute the element-wise minimum of unsigned 8-bit values. */
1501extern __inline __m64
1502 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1503 _mm_min_pu8(__m64 __A, __m64 __B) {
1504#if _ARCH_PWR8
1505 __vector unsigned char __a, __b, __r;
1506 __vector __bool char __c;
1507
1508 __a = (__vector unsigned char)vec_splats(__A);
1509 __b = (__vector unsigned char)vec_splats(__B);
1510 __c = (__vector __bool char)vec_cmplt(__a, __b);
1511 __r = vec_sel(__b, __a, __c);
1512 return (__m64)((__vector long long)__r)[0];
1513#else
1514 __m64_union __m1, __m2, __res;
1515 long __i;
1516
1517 __m1.as_m64 = __A;
1518 __m2.as_m64 = __B;
1519
1520 for (__i = 0; __i < 8; __i++)
1521 __res.as_char[__i] =
1522 ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
1523 ? __m1.as_char[__i]
1524 : __m2.as_char[__i];
1525
1526 return (__m64)__res.as_m64;
1527#endif
1528}
1529
1530extern __inline __m64
1531 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1532 _m_pminub(__m64 __A, __m64 __B) {
1533 return _mm_min_pu8(__A, __B);
1534}
1535
1536/* Create an 8-bit mask of the signs of 8-bit values. */
1537extern __inline int
1538 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1539 _mm_movemask_pi8(__m64 __A) {
1540#ifdef __powerpc64__
1541 unsigned long long __p =
1542#ifdef __LITTLE_ENDIAN__
1543 0x0008101820283038UL; // permute control for sign bits
1544#else
1545 0x3830282018100800UL; // permute control for sign bits
1546#endif
1547 return __builtin_bpermd(__p, __A);
1548#else
1549#ifdef __LITTLE_ENDIAN__
1550 unsigned int __mask = 0x20283038UL;
1551 unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
1552 unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1553#else
1554 unsigned int __mask = 0x38302820UL;
1555 unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
1556 unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
1557#endif
1558 return (__r2 << 4) | __r1;
1559#endif
1560}
1561
1562extern __inline int
1563 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1564 _m_pmovmskb(__m64 __A) {
1565 return _mm_movemask_pi8(__A);
1566}
1567
1568/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1569 in B and produce the high 16 bits of the 32-bit results. */
1570extern __inline __m64
1571 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1572 _mm_mulhi_pu16(__m64 __A, __m64 __B) {
1573 __vector unsigned short __a, __b;
1574 __vector unsigned short __c;
1575 __vector unsigned int __w0, __w1;
1576 __vector unsigned char __xform1 = {
1577#ifdef __LITTLE_ENDIAN__
1578 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1579 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1580#else
1581 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1582 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1583#endif
1584 };
1585
1586 __a = (__vector unsigned short)vec_splats(__A);
1587 __b = (__vector unsigned short)vec_splats(__B);
1588
1589 __w0 = vec_vmuleuh(__a, __b);
1590 __w1 = vec_vmulouh(__a, __b);
1591 __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);
1592
1593 return (__m64)((__vector long long)__c)[0];
1594}
1595
1596extern __inline __m64
1597 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1598 _m_pmulhuw(__m64 __A, __m64 __B) {
1599 return _mm_mulhi_pu16(__A, __B);
1600}
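/* Illustration only: each lane keeps the high 16 bits of the 32-bit unsigned
   product, e.g. 0xFFFF * 0xFFFF = 0xFFFE0001 keeps 0xFFFE.
   __mulhi_pu16_demo is a hypothetical helper, not an SSE intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __mulhi_pu16_demo(void) {
  __m64 __a = 0x0000ffff00020003UL; /* lanes (low to high): 3, 2, 0xffff, 0 */
  __m64 __b = 0x0000ffff00040005UL; /* lanes (low to high): 5, 4, 0xffff, 0 */
  /* 3*5 and 2*4 have zero high halves; expected result 0x0000fffe00000000. */
  return _mm_mulhi_pu16(__a, __b);
}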
1601
1602/* Return a combination of the four 16-bit values in A. The selector
1603 must be an immediate. */
1604extern __inline __m64
1605 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606 _mm_shuffle_pi16(__m64 __A, int const __N) {
1607 unsigned long __element_selector_10 = __N & 0x03;
1608 unsigned long __element_selector_32 = (__N >> 2) & 0x03;
1609 unsigned long __element_selector_54 = (__N >> 4) & 0x03;
1610 unsigned long __element_selector_76 = (__N >> 6) & 0x03;
1611 static const unsigned short __permute_selectors[4] = {
1612#ifdef __LITTLE_ENDIAN__
1613 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1614#else
1615 0x0607, 0x0405, 0x0203, 0x0001
1616#endif
1617 };
1618 __m64_union __t;
1619 __vector unsigned long long __a, __p, __r;
1620
1621#ifdef __LITTLE_ENDIAN__
1622 __t.as_short[0] = __permute_selectors[__element_selector_10];
1623 __t.as_short[1] = __permute_selectors[__element_selector_32];
1624 __t.as_short[2] = __permute_selectors[__element_selector_54];
1625 __t.as_short[3] = __permute_selectors[__element_selector_76];
1626#else
1627 __t.as_short[3] = __permute_selectors[__element_selector_10];
1628 __t.as_short[2] = __permute_selectors[__element_selector_32];
1629 __t.as_short[1] = __permute_selectors[__element_selector_54];
1630 __t.as_short[0] = __permute_selectors[__element_selector_76];
1631#endif
1632 __p = vec_splats(__t.as_m64);
1633 __a = vec_splats(__A);
1634 __r = vec_perm(__a, __a, (__vector unsigned char)__p);
1635 return (__m64)((__vector long long)__r)[0];
1636}
1637
1638extern __inline __m64
1639 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1640 _m_pshufw(__m64 __A, int const __N) {
1641 return _mm_shuffle_pi16(__A, __N);
1642}
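/* Illustration only: with _MM_SHUFFLE(0, 1, 2, 3) lane 0 of the result takes
   element 3, lane 1 takes element 2, and so on, i.e. the four 16-bit
   elements are reversed.  __shuffle_pi16_demo is a hypothetical helper, not
   an SSE intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __shuffle_pi16_demo(__m64 __a) {
  return _mm_shuffle_pi16(__a, _MM_SHUFFLE(0, 1, 2, 3));
}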
1643
1644/* Conditionally store byte elements of A into P. The high bit of each
1645 byte in the selector N determines whether the corresponding byte from
1646 A is stored. */
1647extern __inline void
1648 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1649 _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
1650 __m64 __hibit = 0x8080808080808080UL;
1651 __m64 __mask, __tmp;
1652 __m64 *__p = (__m64 *)__P;
1653
1654 __tmp = *__p;
1655 __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
1656 __tmp = (__tmp & (~__mask)) | (__A & __mask);
1657 *__p = __tmp;
1658}
1659
1660extern __inline void
1661 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1662 _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
1663 _mm_maskmove_si64(__A, __N, __P);
1664}
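/* Illustration only: the selector below has the high bit set in bytes 1 and
   3, so only bytes 1 and 3 of __data (0x02 and 0x04) are merged into the
   destination; __buf must provide at least 8 writable bytes.
   __maskmove_demo is a hypothetical helper, not an SSE intrinsic.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __maskmove_demo(char *__buf) {
  __m64 __data = 0x0807060504030201UL;
  __m64 __mask = 0x0000000080008000UL;
  _mm_maskmove_si64(__data, __mask, __buf);
}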
1665
1666/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1667extern __inline __m64
1668 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1669 _mm_avg_pu8(__m64 __A, __m64 __B) {
1670 __vector unsigned char __a, __b, __c;
1671
1672 __a = (__vector unsigned char)vec_splats(__A);
1673 __b = (__vector unsigned char)vec_splats(__B);
1674 __c = vec_avg(__a, __b);
1675 return (__m64)((__vector long long)__c)[0];
1676}
1677
1678extern __inline __m64
1679 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1680 _m_pavgb(__m64 __A, __m64 __B) {
1681 return _mm_avg_pu8(__A, __B);
1682}
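/* Illustration only: the average rounds up, i.e. (a + b + 1) >> 1 per byte,
   so averaging 1 and 2 yields 2 in the low byte.  __avg_pu8_demo is a
   hypothetical helper, not an SSE intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __avg_pu8_demo(void) {
  __m64 __a = 0x0000000000000001UL;
  __m64 __b = 0x0000000000000002UL;
  return _mm_avg_pu8(__a, __b); /* expected low byte: 2 */
}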
1683
1684/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1685extern __inline __m64
1686 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1687 _mm_avg_pu16(__m64 __A, __m64 __B) {
1688 __vector unsigned short __a, __b, __c;
1689
1690 __a = (__vector unsigned short)vec_splats(__A);
1691 __b = (__vector unsigned short)vec_splats(__B);
1692 __c = vec_avg(__a, __b);
1693 return (__m64)((__vector long long)__c)[0];
1694}
1695
1696extern __inline __m64
1697 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1698 _m_pavgw(__m64 __A, __m64 __B) {
1699 return _mm_avg_pu16(__A, __B);
1700}
1701
1702/* Compute the sum of the absolute differences of the unsigned 8-bit
1703 values in A and B. Return the value in the lower 16-bit word; the
1704 upper words are cleared. */
1705extern __inline __m64
1706 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1707 _mm_sad_pu8(__m64 __A, __m64 __B) {
1708 __vector unsigned char __a, __b;
1709 __vector unsigned char __vmin, __vmax, __vabsdiff;
1710 __vector signed int __vsum;
1711 const __vector unsigned int __zero = {0, 0, 0, 0};
1712 __m64_union __result = {0};
1713
1714 __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
1715 __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
1716 __vmin = vec_min(__a, __b);
1717 __vmax = vec_max(__a, __b);
1718 __vabsdiff = vec_sub(__vmax, __vmin);
1719 /* Sum four groups of bytes into integers. */
1720 __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
1721 /* Sum across four integers with integer result. */
1722 __vsum = vec_sums(__vsum, (__vector signed int)__zero);
1723 /* The sum is in the rightmost 32 bits of the vector result.
1724 Transfer to a GPR and truncate to 16 bits. */
1725 __result.as_short[0] = __vsum[3];
1726 return __result.as_m64;
1727}
1728
1729extern __inline __m64
1730 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1731 _m_psadbw(__m64 __A, __m64 __B) {
1732 return _mm_sad_pu8(__A, __B);
1733}
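/* Illustration only: the byte-wise absolute differences below are
   |1-4| + |2-2| + |3-0| = 6, returned in the low 16 bits with the upper
   bits cleared.  __sad_pu8_demo is a hypothetical helper, not an SSE
   intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __sad_pu8_demo(void) {
  __m64 __a = 0x0000000000030201UL;
  __m64 __b = 0x0000000000000204UL;
  return _mm_sad_pu8(__a, __b); /* expected: 6 */
}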
1734
1735/* Stores the data in A to the address P without polluting the caches. */
1736extern __inline void
1737 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738 _mm_stream_pi(__m64 *__P, __m64 __A) {
1739 /* Use the data cache block touch for store transient. */
1740 __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
1741 *__P = __A;
1742}
1743
1744/* Likewise. The address must be 16-byte aligned. */
1745extern __inline void
1746 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1747 _mm_stream_ps(float *__P, __m128 __A) {
1748 /* Use the data cache block touch for store transient. */
1749 __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
1750 _mm_store_ps(__P, __A);
1751}
1752
1753/* Guarantees that every preceding store is globally visible before
1754 any subsequent store. */
1755extern __inline void
1756 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_sfence(void) {
1758 /* Generate a light weight sync. */
1759 __atomic_thread_fence(__ATOMIC_RELEASE);
1760}
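/* Illustration only: a typical pattern streams a large buffer with
   _mm_stream_ps and then publishes the stores with _mm_sfence.
   __stream_fill is a hypothetical helper; __dst is assumed 16-byte aligned
   and __n a multiple of 4.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __stream_fill(float *__dst, __m128 __v, int __n) {
  int __i;
  for (__i = 0; __i < __n; __i += 4)
    _mm_stream_ps(__dst + __i, __v);
  _mm_sfence(); /* make the streamed stores globally visible */
}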
1761
1762/* The execution of the next instruction is delayed by an implementation
1763 specific amount of time. The instruction does not modify the
1764 architectural state. In the x86 header this intrinsic follows the
1765 pop_options pragma because it does not require SSE support in the
1766 processor--the encoding is a nop on processors that do not support it. */
1767extern __inline void
1768 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1769 _mm_pause(void) {
1770 /* There is no exact match with this construct, but the following is
1771 close to the desired effect. */
1772#if _ARCH_PWR8
1773 /* On power8 and later processors we can depend on Program Priority
1774 (PRI) and its associated "very low" PRI setting. Since we don't know
1775 what PRI this thread is running at we: 1) save the current PRI
1776 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1777 via the special or 31,31,31 encoding, 3) issue an "isync" to
1778 ensure the PRI change takes effect before we execute any more
1779 instructions.
1780 Now we can execute a lwsync (release barrier) while we execute
1781 this thread at "very low" PRI. Finally we restore the original
1782 PRI and continue execution. */
1783 unsigned long __PPR;
1784
1785 __asm__ volatile(" mfppr %0;"
1786 " or 31,31,31;"
1787 " isync;"
1788 " lwsync;"
1789 " isync;"
1790 " mtppr %0;"
1791 : "=r"(__PPR)
1792 :
1793 : "memory");
1794#else
1795 /* For older processors, where we may not even have Program Priority
1796 controls, we can only depend on Heavy Weight Sync. */
1797 __atomic_thread_fence(__ATOMIC_SEQ_CST);
1798#endif
1799}
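/* Illustration only: _mm_pause is typically used inside a spin-wait so the
   polling thread consumes fewer execution resources.  __spin_wait and
   __flag are hypothetical names.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __spin_wait(volatile int *__flag) {
  while (!*__flag)
    _mm_pause();
}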
1800
1801/* Transpose the 4x4 matrix composed of row[0-3]. */
1802#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1803 do { \
1804 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1805 __v4sf __t0 = vec_vmrghw(__r0, __r1); \
1806 __v4sf __t1 = vec_vmrghw(__r2, __r3); \
1807 __v4sf __t2 = vec_vmrglw(__r0, __r1); \
1808 __v4sf __t3 = vec_vmrglw(__r2, __r3); \
1809 (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \
1810 (__vector long long)__t1); \
1811 (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \
1812 (__vector long long)__t1); \
1813 (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \
1814 (__vector long long)__t3); \
1815 (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \
1816 (__vector long long)__t3); \
1817 } while (0)
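/* Illustration only: the macro transposes four row vectors in place, so
   after the call element j of row i holds what was element i of row j.
   __transpose_demo is a hypothetical helper.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __transpose_demo(__m128 __row[4]) {
  _MM_TRANSPOSE4_PS(__row[0], __row[1], __row[2], __row[3]);
}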
1818
1819/* For backward source compatibility. */
1820//# include <emmintrin.h>
1821
1822#else
1823#include_next <xmmintrin.h>
1824#endif /* defined(__powerpc64__) && \
1825 * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
1826
1827#endif /* XMMINTRIN_H_ */