mmintrin.h
1/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file is intended to help port code that explicitly uses
15 Intel intrinsics from x86_64 to powerpc64/powerpc64le.
16
17 Since the PowerPC target doesn't support a native 64-bit vector type,
18 these MMX intrinsics typedef __m64 to a 64-bit unsigned long long,
19 which works well for _si64 and some _pi32 operations.
20
21 For _pi16 and _pi8 operations, it's better to transfer the __m64 into
22 a 128-bit PowerPC vector first. Power8 introduced direct register
23 move instructions, which enable a more efficient implementation.
24
25 It's the user's responsibility to determine whether the results of
26 such a port are acceptable or whether further changes are needed.
27 Please note that much code using Intel intrinsics CAN BE REWRITTEN
28 in more portable and efficient standard C or GNU C extensions, using
29 64-bit scalar operations or 128-bit SSE/Altivec operations, which is
30 the recommended approach. */
31#error \
32 "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33#endif
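/* A minimal porting sketch (illustrative only; file name and target flags are
   just examples): the error above can be acknowledged on the command line,
   e.g.

     clang --target=powerpc64le-unknown-linux-gnu -mcpu=power8 \
           -DNO_WARN_X86_INTRINSICS ported_code.c

   or by defining the macro before this header is included:

     #define NO_WARN_X86_INTRINSICS 1
     #include <mmintrin.h>
*/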
34
35#ifndef _MMINTRIN_H_INCLUDED
36#define _MMINTRIN_H_INCLUDED
37
38#if defined(__powerpc64__) && \
39 (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40
41#include <altivec.h>
42/* The Intel API is flexible enough that we must allow aliasing with other
43 vector types, and their scalar components. */
44typedef __attribute__((__aligned__(8))) unsigned long long __m64;
45
46typedef __attribute__((__aligned__(8))) union {
47 __m64 as_m64;
48 char as_char[8];
49 signed char as_signed_char[8];
50 short as_short[4];
51 int as_int[2];
52 long long as_long_long;
53 float as_float[2];
54 double as_double;
55} __m64_union;
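/* Illustrative sketch: this union is how the scalar fallback paths below view
   a __m64 as lanes. For example:

     __m64_union u;
     u.as_m64 = _mm_set_pi16(4, 3, 2, 1);
     Now u.as_short[0] == 1 and u.as_short[3] == 4; on a little-endian
     target, u.as_int[0] == 0x00020001 and u.as_int[1] == 0x00040003.
*/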
56
57/* Empty the multimedia state. */
58extern __inline void
59 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
60 _mm_empty(void) {
61 /* nothing to do on PowerPC. */
62}
63
64extern __inline void
65 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
66 _m_empty(void) {
67 /* nothing to do on PowerPC. */
68}
69
70/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
71extern __inline __m64
72 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
73 _mm_cvtsi32_si64(int __i) {
74 return (__m64)(unsigned int)__i;
75}
76
77extern __inline __m64
78 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79 _m_from_int(int __i) {
80 return _mm_cvtsi32_si64(__i);
81}
82
83/* Convert the lower 32 bits of the __m64 object into an integer. */
84extern __inline int
85 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 _mm_cvtsi64_si32(__m64 __i) {
87 return ((int)__i);
88}
89
90extern __inline int
91 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
92 _m_to_int(__m64 __i) {
93 return _mm_cvtsi64_si32(__i);
94}
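/* Illustrative sketch of the two conversions above:

     __m64 v  = _mm_cvtsi32_si64(-1);   yields 0x00000000FFFFFFFF (zero-extended)
     int   lo = _mm_cvtsi64_si32(v);    yields -1 (low 32 bits, reinterpreted)
*/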
95
96/* Convert I to a __m64 object. */
97
98/* Intel intrinsic. */
99extern __inline __m64
100 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101 _m_from_int64(long long __i) {
102 return (__m64)__i;
103}
104
105extern __inline __m64
106 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107 _mm_cvtsi64_m64(long long __i) {
108 return (__m64)__i;
109}
110
111/* Microsoft intrinsic. */
112extern __inline __m64
113 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
114 _mm_cvtsi64x_si64(long long __i) {
115 return (__m64)__i;
116}
117
118extern __inline __m64
119 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_set_pi64x(long long __i) {
121 return (__m64)__i;
122}
123
124/* Convert the __m64 object to a 64-bit integer. */
125
126/* Intel intrinsic. */
127extern __inline long long
128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
129 _m_to_int64(__m64 __i) {
130 return (long long)__i;
131}
132
133extern __inline long long
134 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135 _mm_cvtm64_si64(__m64 __i) {
136 return (long long)__i;
137}
138
139/* Microsoft intrinsic. */
140extern __inline long long
141 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_cvtsi64_si64x(__m64 __i) {
143 return (long long)__i;
144}
145
146#ifdef _ARCH_PWR8
147/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
148 the result, and the four 16-bit values from M2 into the upper four 8-bit
149 values of the result, all with signed saturation. */
150extern __inline __m64
151 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
152 _mm_packs_pi16(__m64 __m1, __m64 __m2) {
153 __vector signed short __vm1;
154 __vector signed char __vresult;
155
156 __vm1 = (__vector signed short)(__vector unsigned long long)
157#ifdef __LITTLE_ENDIAN__
158 {__m1, __m2};
159#else
160 {__m2, __m1};
161#endif
162 __vresult = vec_packs(__vm1, __vm1);
163 return (__m64)((__vector long long)__vresult)[0];
164}
165
166extern __inline __m64
167 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168 _m_packsswb(__m64 __m1, __m64 __m2) {
169 return _mm_packs_pi16(__m1, __m2);
170}
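/* Illustrative sketch of the signed saturation performed above:

     __m64 lo = _mm_setr_pi16(-300, 300, 4, -5);   element 0 listed first
     __m64 hi = _mm_setr_pi16(4, 3, 2, 1);
     __m64 r  = _mm_packs_pi16(lo, hi);
     Result bytes, element 0 first: {-128, 127, 4, -5, 4, 3, 2, 1};
     -300 and 300 are clamped to the signed 8-bit range.
*/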
171
172/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
173 the result, and the two 32-bit values from M2 into the upper two 16-bit
174 values of the result, all with signed saturation. */
175extern __inline __m64
176 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_packs_pi32(__m64 __m1, __m64 __m2) {
178 __vector signed int __vm1;
179 __vector signed short __vresult;
180
181 __vm1 = (__vector signed int)(__vector unsigned long long)
182#ifdef __LITTLE_ENDIAN__
183 {__m1, __m2};
184#else
185 {__m2, __m1};
186#endif
187 __vresult = vec_packs(__vm1, __vm1);
188 return (__m64)((__vector long long)__vresult)[0];
189}
190
191extern __inline __m64
192 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
193 _m_packssdw(__m64 __m1, __m64 __m2) {
194 return _mm_packs_pi32(__m1, __m2);
195}
196
197/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
198 the result, and the four 16-bit values from M2 into the upper four 8-bit
199 values of the result, all with unsigned saturation. */
200extern __inline __m64
201 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 _mm_packs_pu16(__m64 __m1, __m64 __m2) {
203 __vector unsigned char __r;
204 __vector signed short __vm1 = (__vector signed short)(__vector long long)
205#ifdef __LITTLE_ENDIAN__
206 {__m1, __m2};
207#else
208 {__m2, __m1};
209#endif
210 const __vector signed short __zero = {0};
211 __vector __bool short __select = vec_cmplt(__vm1, __zero);
212 __r =
213 vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
214 __vector __bool char __packsel = vec_pack(__select, __select);
215 __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
216 return (__m64)((__vector long long)__r)[0];
217}
218
219extern __inline __m64
220 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _m_packuswb(__m64 __m1, __m64 __m2) {
222 return _mm_packs_pu16(__m1, __m2);
223}
224#endif /* end ARCH_PWR8 */
225
226/* Interleave the four 8-bit values from the high half of M1 with the four
227 8-bit values from the high half of M2. */
228extern __inline __m64
229 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
231#if _ARCH_PWR8
232 __vector unsigned char __a, __b, __c;
233
234 __a = (__vector unsigned char)vec_splats(__m1);
235 __b = (__vector unsigned char)vec_splats(__m2);
236 __c = vec_mergel(__a, __b);
237 return (__m64)((__vector long long)__c)[1];
238#else
239 __m64_union __mu1, __mu2, __res;
240
241 __mu1.as_m64 = __m1;
242 __mu2.as_m64 = __m2;
243
244 __res.as_char[0] = __mu1.as_char[4];
245 __res.as_char[1] = __mu2.as_char[4];
246 __res.as_char[2] = __mu1.as_char[5];
247 __res.as_char[3] = __mu2.as_char[5];
248 __res.as_char[4] = __mu1.as_char[6];
249 __res.as_char[5] = __mu2.as_char[6];
250 __res.as_char[6] = __mu1.as_char[7];
251 __res.as_char[7] = __mu2.as_char[7];
252
253 return (__m64)__res.as_m64;
254#endif
255}
256
257extern __inline __m64
258 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259 _m_punpckhbw(__m64 __m1, __m64 __m2) {
260 return _mm_unpackhi_pi8(__m1, __m2);
261}
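/* Illustrative sketch of the high-half interleave above:

     __m64 a = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
     __m64 b = _mm_setr_pi8(10, 11, 12, 13, 14, 15, 16, 17);
     __m64 r = _mm_unpackhi_pi8(a, b);
     Result bytes, element 0 first: {4, 14, 5, 15, 6, 16, 7, 17}.
*/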
262
263/* Interleave the two 16-bit values from the high half of M1 with the two
264 16-bit values from the high half of M2. */
265extern __inline __m64
266 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
268 __m64_union __mu1, __mu2, __res;
269
270 __mu1.as_m64 = __m1;
271 __mu2.as_m64 = __m2;
272
273 __res.as_short[0] = __mu1.as_short[2];
274 __res.as_short[1] = __mu2.as_short[2];
275 __res.as_short[2] = __mu1.as_short[3];
276 __res.as_short[3] = __mu2.as_short[3];
277
278 return (__m64)__res.as_m64;
279}
280
281extern __inline __m64
282 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283 _m_punpckhwd(__m64 __m1, __m64 __m2) {
284 return _mm_unpackhi_pi16(__m1, __m2);
285}
286/* Interleave the 32-bit value from the high half of M1 with the 32-bit
287 value from the high half of M2. */
288extern __inline __m64
289 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
291 __m64_union __mu1, __mu2, __res;
292
293 __mu1.as_m64 = __m1;
294 __mu2.as_m64 = __m2;
295
296 __res.as_int[0] = __mu1.as_int[1];
297 __res.as_int[1] = __mu2.as_int[1];
298
299 return (__m64)__res.as_m64;
300}
301
302extern __inline __m64
303 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304 _m_punpckhdq(__m64 __m1, __m64 __m2) {
305 return _mm_unpackhi_pi32(__m1, __m2);
306}
307/* Interleave the four 8-bit values from the low half of M1 with the four
308 8-bit values from the low half of M2. */
309extern __inline __m64
310 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
312#if _ARCH_PWR8
313 __vector unsigned char __a, __b, __c;
314
315 __a = (__vector unsigned char)vec_splats(__m1);
316 __b = (__vector unsigned char)vec_splats(__m2);
317 __c = vec_mergel(__a, __b);
318 return (__m64)((__vector long long)__c)[0];
319#else
320 __m64_union __mu1, __mu2, __res;
321
322 __mu1.as_m64 = __m1;
323 __mu2.as_m64 = __m2;
324
325 __res.as_char[0] = __mu1.as_char[0];
326 __res.as_char[1] = __mu2.as_char[0];
327 __res.as_char[2] = __mu1.as_char[1];
328 __res.as_char[3] = __mu2.as_char[1];
329 __res.as_char[4] = __mu1.as_char[2];
330 __res.as_char[5] = __mu2.as_char[2];
331 __res.as_char[6] = __mu1.as_char[3];
332 __res.as_char[7] = __mu2.as_char[3];
333
334 return (__m64)__res.as_m64;
335#endif
336}
337
338extern __inline __m64
339 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
340 _m_punpcklbw(__m64 __m1, __m64 __m2) {
341 return _mm_unpacklo_pi8(__m1, __m2);
342}
343/* Interleave the two 16-bit values from the low half of M1 with the two
344 16-bit values from the low half of M2. */
345extern __inline __m64
346 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
348 __m64_union __mu1, __mu2, __res;
349
350 __mu1.as_m64 = __m1;
351 __mu2.as_m64 = __m2;
352
353 __res.as_short[0] = __mu1.as_short[0];
354 __res.as_short[1] = __mu2.as_short[0];
355 __res.as_short[2] = __mu1.as_short[1];
356 __res.as_short[3] = __mu2.as_short[1];
357
358 return (__m64)__res.as_m64;
359}
360
361extern __inline __m64
362 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363 _m_punpcklwd(__m64 __m1, __m64 __m2) {
364 return _mm_unpacklo_pi16(__m1, __m2);
365}
366
367/* Interleave the 32-bit value from the low half of M1 with the 32-bit
368 value from the low half of M2. */
369extern __inline __m64
370 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
372 __m64_union __mu1, __mu2, __res;
373
374 __mu1.as_m64 = __m1;
375 __mu2.as_m64 = __m2;
376
377 __res.as_int[0] = __mu1.as_int[0];
378 __res.as_int[1] = __mu2.as_int[0];
379
380 return (__m64)__res.as_m64;
381}
382
383extern __inline __m64
384 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _m_punpckldq(__m64 __m1, __m64 __m2) {
386 return _mm_unpacklo_pi32(__m1, __m2);
387}
388
389/* Add the 8-bit values in M1 to the 8-bit values in M2. */
390extern __inline __m64
391 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392 _mm_add_pi8(__m64 __m1, __m64 __m2) {
393#if _ARCH_PWR8
394 __vector signed char __a, __b, __c;
395
396 __a = (__vector signed char)vec_splats(__m1);
397 __b = (__vector signed char)vec_splats(__m2);
398 __c = vec_add(__a, __b);
399 return (__m64)((__vector long long)__c)[0];
400#else
401 __m64_union __mu1, __mu2, __res;
402
403 __mu1.as_m64 = __m1;
404 __mu2.as_m64 = __m2;
405
406 __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
407 __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
408 __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
409 __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
410 __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
411 __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
412 __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
413 __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
414
415 return (__m64)__res.as_m64;
416#endif
417}
418
419extern __inline __m64
420 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
421 _m_paddb(__m64 __m1, __m64 __m2) {
422 return _mm_add_pi8(__m1, __m2);
423}
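/* Illustrative sketch: the element-wise additions here wrap on overflow
   (modular arithmetic), unlike the saturating _mm_adds_* variants below.

     __m64 a = _mm_set1_pi8(100);
     __m64 r = _mm_add_pi8(a, a);
     Every byte of r holds 0xC8 (200 modulo 256, i.e. -56 as a signed 8-bit value).
*/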
424
425/* Add the 16-bit values in M1 to the 16-bit values in M2. */
426extern __inline __m64
427 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
428 _mm_add_pi16(__m64 __m1, __m64 __m2) {
429#if _ARCH_PWR8
430 __vector signed short __a, __b, __c;
431
432 __a = (__vector signed short)vec_splats(__m1);
433 __b = (__vector signed short)vec_splats(__m2);
434 __c = vec_add(__a, __b);
435 return (__m64)((__vector long long)__c)[0];
436#else
437 __m64_union __mu1, __mu2, __res;
438
439 __mu1.as_m64 = __m1;
440 __mu2.as_m64 = __m2;
441
442 __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
443 __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
444 __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
445 __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
446
447 return (__m64)__res.as_m64;
448#endif
449}
450
451extern __inline __m64
452 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
453 _m_paddw(__m64 __m1, __m64 __m2) {
454 return _mm_add_pi16(__m1, __m2);
455}
456
457/* Add the 32-bit values in M1 to the 32-bit values in M2. */
458extern __inline __m64
459 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
460 _mm_add_pi32(__m64 __m1, __m64 __m2) {
461#if _ARCH_PWR9
462 __vector signed int __a, __b, __c;
463
464 __a = (__vector signed int)vec_splats(__m1);
465 __b = (__vector signed int)vec_splats(__m2);
466 __c = vec_add(__a, __b);
467 return (__m64)((__vector long long)__c)[0];
468#else
469 __m64_union __mu1, __mu2, __res;
470
471 __mu1.as_m64 = __m1;
472 __mu2.as_m64 = __m2;
473
474 __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
475 __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
476
477 return (__m64)__res.as_m64;
478#endif
479}
480
481extern __inline __m64
482 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 _m_paddd(__m64 __m1, __m64 __m2) {
484 return _mm_add_pi32(__m1, __m2);
485}
486
487/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
488extern __inline __m64
489 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490 _mm_sub_pi8(__m64 __m1, __m64 __m2) {
491#if _ARCH_PWR8
492 __vector signed char __a, __b, __c;
493
494 __a = (__vector signed char)vec_splats(__m1);
495 __b = (__vector signed char)vec_splats(__m2);
496 __c = vec_sub(__a, __b);
497 return (__m64)((__vector long long)__c)[0];
498#else
499 __m64_union __mu1, __mu2, __res;
500
501 __mu1.as_m64 = __m1;
502 __mu2.as_m64 = __m2;
503
504 __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
505 __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
506 __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
507 __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
508 __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
509 __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
510 __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
511 __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
512
513 return (__m64)__res.as_m64;
514#endif
515}
516
517extern __inline __m64
518 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _m_psubb(__m64 __m1, __m64 __m2) {
520 return _mm_sub_pi8(__m1, __m2);
521}
522
523/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
524extern __inline __m64
525 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526 _mm_sub_pi16(__m64 __m1, __m64 __m2) {
527#if _ARCH_PWR8
528 __vector signed short __a, __b, __c;
529
530 __a = (__vector signed short)vec_splats(__m1);
531 __b = (__vector signed short)vec_splats(__m2);
532 __c = vec_sub(__a, __b);
533 return (__m64)((__vector long long)__c)[0];
534#else
535 __m64_union __mu1, __mu2, __res;
536
537 __mu1.as_m64 = __m1;
538 __mu2.as_m64 = __m2;
539
540 __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
541 __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
542 __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
543 __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
544
545 return (__m64)__res.as_m64;
546#endif
547}
548
549extern __inline __m64
550 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551 _m_psubw(__m64 __m1, __m64 __m2) {
552 return _mm_sub_pi16(__m1, __m2);
553}
554
555/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
556extern __inline __m64
557 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 _mm_sub_pi32(__m64 __m1, __m64 __m2) {
559#if _ARCH_PWR9
560 __vector signed int __a, __b, __c;
561
562 __a = (__vector signed int)vec_splats(__m1);
563 __b = (__vector signed int)vec_splats(__m2);
564 __c = vec_sub(__a, __b);
565 return (__m64)((__vector long long)__c)[0];
566#else
567 __m64_union __mu1, __mu2, __res;
568
569 __mu1.as_m64 = __m1;
570 __mu2.as_m64 = __m2;
571
572 __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
573 __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
574
575 return (__m64)__res.as_m64;
576#endif
577}
578
579extern __inline __m64
580 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581 _m_psubd(__m64 __m1, __m64 __m2) {
582 return _mm_sub_pi32(__m1, __m2);
583}
584
585extern __inline __m64
586 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587 _mm_add_si64(__m64 __m1, __m64 __m2) {
588 return (__m1 + __m2);
589}
590
591extern __inline __m64
592 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593 _mm_sub_si64(__m64 __m1, __m64 __m2) {
594 return (__m1 - __m2);
595}
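/* Illustrative sketch: since __m64 is a plain 64-bit integer on this target,
   the two intrinsics above are ordinary 64-bit addition and subtraction.

     __m64 a = _mm_cvtsi64_m64(10);
     __m64 b = _mm_cvtsi64_m64(3);
     _mm_add_si64(a, b)   yields 13
     _mm_sub_si64(a, b)   yields 7
*/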
596
597/* Shift the 64-bit value in M left by COUNT. */
598extern __inline __m64
599 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
600 _mm_sll_si64(__m64 __m, __m64 __count) {
601 return (__m << __count);
602}
603
604extern __inline __m64
605 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606 _m_psllq(__m64 __m, __m64 __count) {
607 return _mm_sll_si64(__m, __count);
608}
609
610extern __inline __m64
611 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
612 _mm_slli_si64(__m64 __m, const int __count) {
613 return (__m << __count);
614}
615
616extern __inline __m64
617 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
618 _m_psllqi(__m64 __m, const int __count) {
619 return _mm_slli_si64(__m, __count);
620}
621
622/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
623extern __inline __m64
624 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_srl_si64(__m64 __m, __m64 __count) {
626 return (__m >> __count);
627}
628
629extern __inline __m64
630 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _m_psrlq(__m64 __m, __m64 __count) {
632 return _mm_srl_si64(__m, __count);
633}
634
635extern __inline __m64
636 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637 _mm_srli_si64(__m64 __m, const int __count) {
638 return (__m >> __count);
639}
640
641extern __inline __m64
642 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _m_psrlqi(__m64 __m, const int __count) {
644 return _mm_srli_si64(__m, __count);
645}
646
647/* Bit-wise AND the 64-bit values in M1 and M2. */
648extern __inline __m64
649 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
650 _mm_and_si64(__m64 __m1, __m64 __m2) {
651 return (__m1 & __m2);
652}
653
654extern __inline __m64
655 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656 _m_pand(__m64 __m1, __m64 __m2) {
657 return _mm_and_si64(__m1, __m2);
658}
659
660/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
661 64-bit value in M2. */
662extern __inline __m64
663 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664 _mm_andnot_si64(__m64 __m1, __m64 __m2) {
665 return (~__m1 & __m2);
666}
667
668extern __inline __m64
669 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670 _m_pandn(__m64 __m1, __m64 __m2) {
671 return _mm_andnot_si64(__m1, __m2);
672}
673
674/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
675extern __inline __m64
676 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677 _mm_or_si64(__m64 __m1, __m64 __m2) {
678 return (__m1 | __m2);
679}
680
681extern __inline __m64
682 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
683 _m_por(__m64 __m1, __m64 __m2) {
684 return _mm_or_si64(__m1, __m2);
685}
686
687/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
688extern __inline __m64
689 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
690 _mm_xor_si64(__m64 __m1, __m64 __m2) {
691 return (__m1 ^ __m2);
692}
693
694extern __inline __m64
695 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
696 _m_pxor(__m64 __m1, __m64 __m2) {
697 return _mm_xor_si64(__m1, __m2);
698}
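/* Illustrative sketch of combining the bitwise operations above with a
   comparison mask (x and y stand for any two __m64 values of [4 x i16]):

     __m64 mask = _mm_cmpgt_pi16(x, y);                0xFFFF where x > y, else 0
     __m64 vmax = _mm_or_si64(_mm_and_si64(mask, x),
                              _mm_andnot_si64(mask, y));   element-wise signed max
*/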
699
700/* Creates a 64-bit zero. */
701extern __inline __m64
702 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703 _mm_setzero_si64(void) {
704 return (__m64)0;
705}
706
707/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
708 test is true and zero if false. */
709extern __inline __m64
710 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
712#if defined(_ARCH_PWR6) && defined(__powerpc64__)
713 __m64 __res;
714 __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
715 return (__res);
716#else
717 __m64_union __mu1, __mu2, __res;
718
719 __mu1.as_m64 = __m1;
720 __mu2.as_m64 = __m2;
721
722 __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
723 __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
724 __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
725 __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
726 __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
727 __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
728 __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
729 __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;
730
731 return (__m64)__res.as_m64;
732#endif
733}
734
735extern __inline __m64
736 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737 _m_pcmpeqb(__m64 __m1, __m64 __m2) {
738 return _mm_cmpeq_pi8(__m1, __m2);
739}
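/* Illustrative sketch of the byte-wise equality comparison above:

     __m64 a = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
     __m64 b = _mm_setr_pi8(1, 0, 3, 0, 5, 0, 7, 0);
     __m64 r = _mm_cmpeq_pi8(a, b);
     Result bytes, element 0 first: {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}.
*/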
740
741extern __inline __m64
742 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
744#if _ARCH_PWR8
745 __vector signed char __a, __b, __c;
746
747 __a = (__vector signed char)vec_splats(__m1);
748 __b = (__vector signed char)vec_splats(__m2);
749 __c = (__vector signed char)vec_cmpgt(__a, __b);
750 return (__m64)((__vector long long)__c)[0];
751#else
752 __m64_union __mu1, __mu2, __res;
753
754 __mu1.as_m64 = __m1;
755 __mu2.as_m64 = __m2;
756
757 __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
758 __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
759 __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
760 __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
761 __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
762 __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
763 __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
764 __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;
765
766 return (__m64)__res.as_m64;
767#endif
768}
769
770extern __inline __m64
771 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772 _m_pcmpgtb(__m64 __m1, __m64 __m2) {
773 return _mm_cmpgt_pi8(__m1, __m2);
774}
775
776/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
777 the test is true and zero if false. */
778extern __inline __m64
779 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
780 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
781#if _ARCH_PWR8
782 __vector signed short __a, __b, __c;
783
784 __a = (__vector signed short)vec_splats(__m1);
785 __b = (__vector signed short)vec_splats(__m2);
786 __c = (__vector signed short)vec_cmpeq(__a, __b);
787 return (__m64)((__vector long long)__c)[0];
788#else
789 __m64_union __mu1, __mu2, __res;
790
791 __mu1.as_m64 = __m1;
792 __mu2.as_m64 = __m2;
793
794 __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
795 __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
796 __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
797 __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;
798
799 return (__m64)__res.as_m64;
800#endif
801}
802
803extern __inline __m64
804 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
805 _m_pcmpeqw(__m64 __m1, __m64 __m2) {
806 return _mm_cmpeq_pi16(__m1, __m2);
807}
808
809extern __inline __m64
810 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
812#if _ARCH_PWR8
813 __vector signed short __a, __b, __c;
814
815 __a = (__vector signed short)vec_splats(__m1);
816 __b = (__vector signed short)vec_splats(__m2);
817 __c = (__vector signed short)vec_cmpgt(__a, __b);
818 return (__m64)((__vector long long)__c)[0];
819#else
820 __m64_union __mu1, __mu2, __res;
821
822 __mu1.as_m64 = __m1;
823 __mu2.as_m64 = __m2;
824
825 __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
826 __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
827 __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
828 __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;
829
830 return (__m64)__res.as_m64;
831#endif
832}
833
834extern __inline __m64
835 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836 _m_pcmpgtw(__m64 __m1, __m64 __m2) {
837 return _mm_cmpgt_pi16(__m1, __m2);
838}
839
840/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
841 the test is true and zero if false. */
842extern __inline __m64
843 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
845#if _ARCH_PWR9
846 __vector signed int __a, __b, __c;
847
848 __a = (__vector signed int)vec_splats(__m1);
849 __b = (__vector signed int)vec_splats(__m2);
850 __c = (__vector signed int)vec_cmpeq(__a, __b);
851 return (__m64)((__vector long long)__c)[0];
852#else
853 __m64_union __mu1, __mu2, __res;
854
855 __mu1.as_m64 = __m1;
856 __mu2.as_m64 = __m2;
857
858 __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
859 __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;
860
861 return (__m64)__res.as_m64;
862#endif
863}
864
865extern __inline __m64
866 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867 _m_pcmpeqd(__m64 __m1, __m64 __m2) {
868 return _mm_cmpeq_pi32(__m1, __m2);
869}
870
871extern __inline __m64
872 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
874#if _ARCH_PWR9
875 __vector signed int __a, __b, __c;
876
877 __a = (__vector signed int)vec_splats(__m1);
878 __b = (__vector signed int)vec_splats(__m2);
879 __c = (__vector signed int)vec_cmpgt(__a, __b);
880 return (__m64)((__vector long long)__c)[0];
881#else
882 __m64_union __mu1, __mu2, __res;
883
884 __mu1.as_m64 = __m1;
885 __mu2.as_m64 = __m2;
886
887 __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
888 __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;
889
890 return (__m64)__res.as_m64;
891#endif
892}
893
894extern __inline __m64
895 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
896 _m_pcmpgtd(__m64 __m1, __m64 __m2) {
897 return _mm_cmpgt_pi32(__m1, __m2);
898}
899
900#if _ARCH_PWR8
901/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
902 saturated arithmetic. */
903extern __inline __m64
904 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
905 _mm_adds_pi8(__m64 __m1, __m64 __m2) {
906 __vector signed char __a, __b, __c;
907
908 __a = (__vector signed char)vec_splats(__m1);
909 __b = (__vector signed char)vec_splats(__m2);
910 __c = vec_adds(__a, __b);
911 return (__m64)((__vector long long)__c)[0];
912}
913
914extern __inline __m64
915 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916 _m_paddsb(__m64 __m1, __m64 __m2) {
917 return _mm_adds_pi8(__m1, __m2);
918}
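/* Illustrative sketch of signed saturation: unlike _mm_add_pi8, results are
   clamped to the signed 8-bit range instead of wrapping.

     __m64 r = _mm_adds_pi8(_mm_set1_pi8(100), _mm_set1_pi8(100));
     Every byte of r is 127 (100 + 100 clamps to the int8 maximum).
*/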
919/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
920 saturated arithmetic. */
921extern __inline __m64
922 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
923 _mm_adds_pi16(__m64 __m1, __m64 __m2) {
924 __vector signed short __a, __b, __c;
925
926 __a = (__vector signed short)vec_splats(__m1);
927 __b = (__vector signed short)vec_splats(__m2);
928 __c = vec_adds(__a, __b);
929 return (__m64)((__vector long long)__c)[0];
930}
931
932extern __inline __m64
933 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
934 _m_paddsw(__m64 __m1, __m64 __m2) {
935 return _mm_adds_pi16(__m1, __m2);
936}
937/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
938 saturated arithmetic. */
939extern __inline __m64
940 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941 _mm_adds_pu8(__m64 __m1, __m64 __m2) {
942 __vector unsigned char __a, __b, __c;
943
944 __a = (__vector unsigned char)vec_splats(__m1);
945 __b = (__vector unsigned char)vec_splats(__m2);
946 __c = vec_adds(__a, __b);
947 return (__m64)((__vector long long)__c)[0];
948}
949
950extern __inline __m64
951 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952 _m_paddusb(__m64 __m1, __m64 __m2) {
953 return _mm_adds_pu8(__m1, __m2);
954}
955
956/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
957 saturated arithmetic. */
958extern __inline __m64
959 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960 _mm_adds_pu16(__m64 __m1, __m64 __m2) {
961 __vector unsigned short __a, __b, __c;
962
963 __a = (__vector unsigned short)vec_splats(__m1);
964 __b = (__vector unsigned short)vec_splats(__m2);
965 __c = vec_adds(__a, __b);
966 return (__m64)((__vector long long)__c)[0];
967}
968
969extern __inline __m64
970 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971 _m_paddusw(__m64 __m1, __m64 __m2) {
972 return _mm_adds_pu16(__m1, __m2);
973}
974
975/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
976 saturating arithmetic. */
977extern __inline __m64
978 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
979 _mm_subs_pi8(__m64 __m1, __m64 __m2) {
980 __vector signed char __a, __b, __c;
981
982 __a = (__vector signed char)vec_splats(__m1);
983 __b = (__vector signed char)vec_splats(__m2);
984 __c = vec_subs(__a, __b);
985 return (__m64)((__vector long long)__c)[0];
986}
987
988extern __inline __m64
989 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990 _m_psubsb(__m64 __m1, __m64 __m2) {
991 return _mm_subs_pi8(__m1, __m2);
992}
993
994/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
995 signed saturating arithmetic. */
996extern __inline __m64
997 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998 _mm_subs_pi16(__m64 __m1, __m64 __m2) {
999 __vector signed short __a, __b, __c;
1000
1001 __a = (__vector signed short)vec_splats(__m1);
1002 __b = (__vector signed short)vec_splats(__m2);
1003 __c = vec_subs(__a, __b);
1004 return (__m64)((__vector long long)__c)[0];
1005}
1006
1007extern __inline __m64
1008 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1009 _m_psubsw(__m64 __m1, __m64 __m2) {
1010 return _mm_subs_pi16(__m1, __m2);
1011}
1012
1013/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1014 unsigned saturating arithmetic. */
1015extern __inline __m64
1016 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017 _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1018 __vector unsigned char __a, __b, __c;
1019
1020 __a = (__vector unsigned char)vec_splats(__m1);
1021 __b = (__vector unsigned char)vec_splats(__m2);
1022 __c = vec_subs(__a, __b);
1023 return (__m64)((__vector long long)__c)[0];
1024}
1025
1026extern __inline __m64
1027 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028 _m_psubusb(__m64 __m1, __m64 __m2) {
1029 return _mm_subs_pu8(__m1, __m2);
1030}
1031
1032/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1033 unsigned saturating arithmetic. */
1034extern __inline __m64
1035 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1037 __vector unsigned short __a, __b, __c;
1038
1039 __a = (__vector unsigned short)vec_splats(__m1);
1040 __b = (__vector unsigned short)vec_splats(__m2);
1041 __c = vec_subs(__a, __b);
1042 return (__m64)((__vector long long)__c)[0];
1043}
1044
1045extern __inline __m64
1046 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047 _m_psubusw(__m64 __m1, __m64 __m2) {
1048 return _mm_subs_pu16(__m1, __m2);
1049}
1050
1051/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1052 four 32-bit intermediate results, which are then summed by pairs to
1053 produce two 32-bit results. */
1054extern __inline __m64
1055 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1057 __vector signed short __a, __b;
1058 __vector signed int __c;
1059 __vector signed int __zero = {0, 0, 0, 0};
1060
1061 __a = (__vector signed short)vec_splats(__m1);
1062 __b = (__vector signed short)vec_splats(__m2);
1063 __c = vec_vmsumshm(__a, __b, __zero);
1064 return (__m64)((__vector long long)__c)[0];
1065}
1066
1067extern __inline __m64
1068 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1069 _m_pmaddwd(__m64 __m1, __m64 __m2) {
1070 return _mm_madd_pi16(__m1, __m2);
1071}
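/* Illustrative sketch of the multiply-and-pairwise-add above:

     __m64 a = _mm_setr_pi16(1, 2, 3, 4);
     __m64 b = _mm_setr_pi16(5, 6, 7, 8);
     __m64 r = _mm_madd_pi16(a, b);
     Result 32-bit elements, element 0 first: {1*5 + 2*6, 3*7 + 4*8} = {17, 53}.
*/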
1072/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1073 M2 and produce the high 16 bits of the 32-bit results. */
1074extern __inline __m64
1075 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1077 __vector signed short __a, __b;
1078 __vector signed short __c;
1079 __vector signed int __w0, __w1;
1080 __vector unsigned char __xform1 = {
1081#ifdef __LITTLE_ENDIAN__
1082 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1083 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1084#else
1085 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1086 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1087#endif
1088 };
1089
1090 __a = (__vector signed short)vec_splats(__m1);
1091 __b = (__vector signed short)vec_splats(__m2);
1092
1093 __w0 = vec_vmulesh(__a, __b);
1094 __w1 = vec_vmulosh(__a, __b);
1095 __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);
1096
1097 return (__m64)((__vector long long)__c)[0];
1098}
1099
1100extern __inline __m64
1101 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102 _m_pmulhw(__m64 __m1, __m64 __m2) {
1103 return _mm_mulhi_pi16(__m1, __m2);
1104}
1105
1106/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1107 the low 16 bits of the results. */
1108extern __inline __m64
1109 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110 _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1111 __vector signed short __a, __b, __c;
1112
1113 __a = (__vector signed short)vec_splats(__m1);
1114 __b = (__vector signed short)vec_splats(__m2);
1115 __c = __a * __b;
1116 return (__m64)((__vector long long)__c)[0];
1117}
1118
1119extern __inline __m64
1120 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1121 _m_pmullw(__m64 __m1, __m64 __m2) {
1122 return _mm_mullo_pi16(__m1, __m2);
1123}
1124
1125/* Shift four 16-bit values in M left by COUNT. */
1126extern __inline __m64
1127 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128 _mm_sll_pi16(__m64 __m, __m64 __count) {
1129 __vector signed short __r;
1130 __vector unsigned short __c;
1131
1132 if (__count <= 15) {
1133 __r = (__vector signed short)vec_splats(__m);
1134 __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1135 __r = vec_sl(__r, (__vector unsigned short)__c);
1136 return (__m64)((__vector long long)__r)[0];
1137 } else
1138 return (0);
1139}
1140
1141extern __inline __m64
1142 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1143 _m_psllw(__m64 __m, __m64 __count) {
1144 return _mm_sll_pi16(__m, __count);
1145}
1146
1147extern __inline __m64
1148 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1149 _mm_slli_pi16(__m64 __m, int __count) {
1150 /* Promote int to long then invoke mm_sll_pi16. */
1151 return _mm_sll_pi16(__m, __count);
1152}
1153
1154extern __inline __m64
1155 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156 _m_psllwi(__m64 __m, int __count) {
1157 return _mm_slli_pi16(__m, __count);
1158}
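/* Illustrative sketch of the count handling above: as with the x86
   instruction, a shift count greater than 15 yields all-zero elements rather
   than a modulo shift.

     _mm_slli_pi16(_mm_set1_pi16(3), 2)    each 16-bit element becomes 12
     _mm_slli_pi16(_mm_set1_pi16(3), 16)   each 16-bit element becomes 0
*/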
1159
1160/* Shift two 32-bit values in M left by COUNT. */
1161extern __inline __m64
1162 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1163 _mm_sll_pi32(__m64 __m, __m64 __count) {
1164 __m64_union __res;
1165
1166 __res.as_m64 = __m;
1167
1168 __res.as_int[0] = __res.as_int[0] << __count;
1169 __res.as_int[1] = __res.as_int[1] << __count;
1170 return (__res.as_m64);
1171}
1172
1173extern __inline __m64
1174 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175 _m_pslld(__m64 __m, __m64 __count) {
1176 return _mm_sll_pi32(__m, __count);
1177}
1178
1179extern __inline __m64
1180 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181 _mm_slli_pi32(__m64 __m, int __count) {
1182 /* Promote int to long then invoke mm_sll_pi32. */
1183 return _mm_sll_pi32(__m, __count);
1184}
1185
1186extern __inline __m64
1187 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 _m_pslldi(__m64 __m, int __count) {
1189 return _mm_slli_pi32(__m, __count);
1190}
1191
1192/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1193extern __inline __m64
1194 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195 _mm_sra_pi16(__m64 __m, __m64 __count) {
1196 __vector signed short __r;
1197 __vector unsigned short __c;
1198
1199 if (__count <= 15) {
1200 __r = (__vector signed short)vec_splats(__m);
1201 __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1202 __r = vec_sra(__r, (__vector unsigned short)__c);
1203 return (__m64)((__vector long long)__r)[0];
1204 } else
1205 return (0);
1206}
1207
1208extern __inline __m64
1209 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1210 _m_psraw(__m64 __m, __m64 __count) {
1211 return _mm_sra_pi16(__m, __count);
1212}
1213
1214extern __inline __m64
1215 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1216 _mm_srai_pi16(__m64 __m, int __count) {
1217 /* Promote int to long then invoke mm_sra_pi16. */
1218 return _mm_sra_pi16(__m, __count);
1219}
1220
1221extern __inline __m64
1222 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1223 _m_psrawi(__m64 __m, int __count) {
1224 return _mm_srai_pi16(__m, __count);
1225}
1226
1227/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1228extern __inline __m64
1229 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1230 _mm_sra_pi32(__m64 __m, __m64 __count) {
1231 __m64_union __res;
1232
1233 __res.as_m64 = __m;
1234
1235 __res.as_int[0] = __res.as_int[0] >> __count;
1236 __res.as_int[1] = __res.as_int[1] >> __count;
1237 return (__res.as_m64);
1238}
1239
1240extern __inline __m64
1241 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242 _m_psrad(__m64 __m, __m64 __count) {
1243 return _mm_sra_pi32(__m, __count);
1244}
1245
1246extern __inline __m64
1247 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1248 _mm_srai_pi32(__m64 __m, int __count) {
1249 /* Promote int to long then invoke mm_sra_pi32. */
1250 return _mm_sra_pi32(__m, __count);
1251}
1252
1253extern __inline __m64
1254 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255 _m_psradi(__m64 __m, int __count) {
1256 return _mm_srai_pi32(__m, __count);
1257}
1258
1259/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1260extern __inline __m64
1261 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262 _mm_srl_pi16(__m64 __m, __m64 __count) {
1263 __vector unsigned short __r;
1264 __vector unsigned short __c;
1265
1266 if (__count <= 15) {
1267 __r = (__vector unsigned short)vec_splats(__m);
1268 __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1269 __r = vec_sr(__r, (__vector unsigned short)__c);
1270 return (__m64)((__vector long long)__r)[0];
1271 } else
1272 return (0);
1273}
1274
1275extern __inline __m64
1276 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1277 _m_psrlw(__m64 __m, __m64 __count) {
1278 return _mm_srl_pi16(__m, __count);
1279}
1280
1281extern __inline __m64
1282 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1283 _mm_srli_pi16(__m64 __m, int __count) {
1284 /* Promote int to long then invoke mm_srl_pi16. */
1285 return _mm_srl_pi16(__m, __count);
1286}
1287
1288extern __inline __m64
1289 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 _m_psrlwi(__m64 __m, int __count) {
1291 return _mm_srli_pi16(__m, __count);
1292}
1293
1294/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1295extern __inline __m64
1296 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _mm_srl_pi32(__m64 __m, __m64 __count) {
1298 __m64_union __res;
1299
1300 __res.as_m64 = __m;
1301
1302 __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
1303 __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
1304 return (__res.as_m64);
1305}
1306
1307extern __inline __m64
1308 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309 _m_psrld(__m64 __m, __m64 __count) {
1310 return _mm_srl_pi32(__m, __count);
1311}
1312
1313extern __inline __m64
1314 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315 _mm_srli_pi32(__m64 __m, int __count) {
1316 /* Promote int to long then invoke mm_srl_pi32. */
1317 return _mm_srl_pi32(__m, __count);
1318}
1319
1320extern __inline __m64
1321 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322 _m_psrldi(__m64 __m, int __count) {
1323 return _mm_srli_pi32(__m, __count);
1324}
1325#endif /* _ARCH_PWR8 */
1326
1327/* Creates a vector of two 32-bit values; I0 is least significant. */
1328extern __inline __m64
1329 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330 _mm_set_pi32(int __i1, int __i0) {
1331 __m64_union __res;
1332
1333 __res.as_int[0] = __i0;
1334 __res.as_int[1] = __i1;
1335 return (__res.as_m64);
1336}
1337
1338/* Creates a vector of four 16-bit values; W0 is least significant. */
1339extern __inline __m64
1340 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1342 __m64_union __res;
1343
1344 __res.as_short[0] = __w0;
1345 __res.as_short[1] = __w1;
1346 __res.as_short[2] = __w2;
1347 __res.as_short[3] = __w3;
1348 return (__res.as_m64);
1349}
1350
1351/* Creates a vector of eight 8-bit values; B0 is least significant. */
1352extern __inline __m64
1353 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1354 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1355 char __b2, char __b1, char __b0) {
1356 __m64_union __res;
1357
1358 __res.as_char[0] = __b0;
1359 __res.as_char[1] = __b1;
1360 __res.as_char[2] = __b2;
1361 __res.as_char[3] = __b3;
1362 __res.as_char[4] = __b4;
1363 __res.as_char[5] = __b5;
1364 __res.as_char[6] = __b6;
1365 __res.as_char[7] = __b7;
1366 return (__res.as_m64);
1367}
1368
1369/* Similar, but with the arguments in reverse order. */
1370extern __inline __m64
1371 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372 _mm_setr_pi32(int __i0, int __i1) {
1373 __m64_union __res;
1374
1375 __res.as_int[0] = __i0;
1376 __res.as_int[1] = __i1;
1377 return (__res.as_m64);
1378}
1379
1380extern __inline __m64
1381 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1383 return _mm_set_pi16(__w3, __w2, __w1, __w0);
1384}
1385
1386extern __inline __m64
1387 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1389 char __b5, char __b6, char __b7) {
1390 return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1391}
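/* Illustrative sketch of the argument ordering: _mm_set_* takes the most
   significant element first, while _mm_setr_* takes the least significant
   first, so

     _mm_set_pi16(4, 3, 2, 1) == _mm_setr_pi16(1, 2, 3, 4)

   and both place the value 1 in element 0. */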
1392
1393/* Creates a vector of two 32-bit values, both elements containing I. */
1394extern __inline __m64
1395 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1396 _mm_set1_pi32(int __i) {
1397 __m64_union __res;
1398
1399 __res.as_int[0] = __i;
1400 __res.as_int[1] = __i;
1401 return (__res.as_m64);
1402}
1403
1404/* Creates a vector of four 16-bit values, all elements containing W. */
1405extern __inline __m64
1406 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407 _mm_set1_pi16(short __w) {
1408#if _ARCH_PWR9
1409 __vector signed short __res;
1410
1411 __res = (__vector signed short)vec_splats(__w);
1412 return (__m64)((__vector long long)__res)[0];
1413#else
1414 __m64_union __res;
1415
1416 __res.as_short[0] = __w;
1417 __res.as_short[1] = __w;
1418 __res.as_short[2] = __w;
1419 __res.as_short[3] = __w;
1420 return (__res.as_m64);
1421#endif
1422}
1423
1424/* Creates a vector of eight 8-bit values, all elements containing B. */
1425extern __inline __m64
1426 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427 _mm_set1_pi8(signed char __b) {
1428#if _ARCH_PWR8
1429 __vector signed char __res;
1430
1431 __res = (__vector signed char)vec_splats(__b);
1432 return (__m64)((__vector long long)__res)[0];
1433#else
1434 __m64_union __res;
1435
1436 __res.as_char[0] = __b;
1437 __res.as_char[1] = __b;
1438 __res.as_char[2] = __b;
1439 __res.as_char[3] = __b;
1440 __res.as_char[4] = __b;
1441 __res.as_char[5] = __b;
1442 __res.as_char[6] = __b;
1443 __res.as_char[7] = __b;
1444 return (__res.as_m64);
1445#endif
1446}
1447
1448#else
1449#include_next <mmintrin.h>
1450#endif /* defined(__powerpc64__) && \
1451 * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
1452
1453#endif /* _MMINTRIN_H_INCLUDED */
__device__ int
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:10393
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1708
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:10527
static __inline__ vector float vector float vector float __c
Definition: altivec.h:4800
static __inline__ vector float vector float __b
Definition: altivec.h:578
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:14737
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:5361
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:12149
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:626
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7962
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:8588
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2435
static __inline__ vector signed char __ATTRS_o_ai vec_pack(vector signed short __a, vector signed short __b)
Definition: altivec.h:7389
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8882
static __inline__ vector signed char __ATTRS_o_ai vec_add(vector signed char __a, vector signed char __b)
Definition: altivec.h:200
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2131
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7715
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:11869
static __inline__ void int __a
Definition: emmintrin.h:4079
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b)
Subtracts signed or unsigned 64-bit integer values and writes the difference to the corresponding bit...
Definition: emmintrin.h:2560
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b)
Adds two signed or unsigned 64-bit integer values, returning the lower 64 bits of the sum.
Definition: emmintrin.h:2130
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1, __m64 __m2)
Performs a bitwise NOT of the first 64-bit integer vector, and then performs a bitwise AND of the int...
Definition: mmintrin.h:1159
#define _m_empty
Definition: mmintrin.h:1551
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1, __m64 __m2)
Adds each 32-bit integer element of the first 64-bit integer vector of [2 x i32] to the corresponding...
Definition: mmintrin.h:425
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1, __m64 __m2)
Performs a bitwise AND of two 64-bit integer vectors.
Definition: mmintrin.h:1138
#define _m_pcmpeqd
Definition: mmintrin.h:1604
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1, __m64 __m2)
Adds each 8-bit integer element of the first 64-bit integer vector of [8 x i8] to the corresponding 8...
Definition: mmintrin.h:383
#define _m_pand
Definition: mmintrin.h:1598
#define _m_pslld
Definition: mmintrin.h:1584
#define _m_pcmpgtd
Definition: mmintrin.h:1607
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1, __m64 __m2)
Adds each 16-bit integer element of the first 64-bit integer vector of [4 x i16] to the corresponding...
Definition: mmintrin.h:404
#define _m_pcmpgtb
Definition: mmintrin.h:1605
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1, __m64 __m2)
Adds, with saturation, each 16-bit unsigned integer element of the first 64-bit integer vector of [4 ...
Definition: mmintrin.h:519
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1, __m64 __m2)
Converts, with saturation, 32-bit signed integers from both 64-bit integer vector parameters of [2 x ...
Definition: mmintrin.h:189
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
Compares the 32-bit integer elements of two 64-bit integer vectors of [2 x i32] to determine if the e...
Definition: mmintrin.h:1261
#define _m_psrlwi
Definition: mmintrin.h:1593
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 8-bit integer val...
Definition: mmintrin.h:1541
#define _m_psllq
Definition: mmintrin.h:1586
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_set1_pi8(char __b)
Constructs a 64-bit integer vector of [8 x i8], with each of the 8-bit integer vector elements set to...
Definition: mmintrin.h:1469
#define _m_packuswb
Definition: mmintrin.h:1558
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_setr_pi32(int __i0, int __i1)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 32-bit integer va...
Definition: mmintrin.h:1489
#define _m_psllwi
Definition: mmintrin.h:1583
#define _m_packsswb
Definition: mmintrin.h:1556
#define _m_to_int64
Definition: mmintrin.h:1555
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:340
#define _m_paddb
Definition: mmintrin.h:1565
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition: mmintrin.h:163
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1, __m64 __m2)
Subtracts each 8-bit integer element of the second 64-bit integer vector of [8 x i8] from the corresp...
Definition: mmintrin.h:540
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
Constructs a 64-bit integer vector, initialized in reverse order with the specified 16-bit integer va...
Definition: mmintrin.h:1511
#define _m_paddusw
Definition: mmintrin.h:1571
#define _m_psubusb
Definition: mmintrin.h:1577
#define _m_to_int
Definition: mmintrin.h:1554
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m, int __count)
Right-shifts each 32-bit integer element of a 64-bit integer vector of [2 x i32] by the number of bit...
Definition: mmintrin.h:982
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1, __m64 __m2)
Adds, with saturation, each 8-bit unsigned integer element of the first 64-bit integer vector of [8 x...
Definition: mmintrin.h:496
#define _m_punpckhdq
Definition: mmintrin.h:1561
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
Constructs a 64-bit integer vector initialized with the specified 16-bit integer values.
Definition: mmintrin.h:1384
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m, int __count)
Right-shifts each 16-bit integer element of a 64-bit integer vector of [4 x i16] by the number of bit...
Definition: mmintrin.h:1029
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m, int __count)
Right-shifts each 16-bit integer element of a 64-bit integer vector of [4 x i16] by the number of bit...
Definition: mmintrin.h:933
#define _m_pcmpeqw
Definition: mmintrin.h:1603
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1, __m64 __m2)
Subtracts each 8-bit unsigned integer element of the second 64-bit integer vector of [8 x i8] from th...
Definition: mmintrin.h:654
#define _m_psllw
Definition: mmintrin.h:1582
#define _m_por
Definition: mmintrin.h:1600
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1, __m64 __m2)
Adds, with saturation, each 8-bit signed integer element of the first 64-bit integer vector of [8 x i...
Definition: mmintrin.h:449
#define _m_punpckhwd
Definition: mmintrin.h:1560
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1, __m64 __m2)
Performs a bitwise OR of two 64-bit integer vectors.
Definition: mmintrin.h:1177
static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m)
Casts a 64-bit integer vector into a 64-bit signed integer value.
Definition: mmintrin.h:138
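A small round-trip sketch for the 64-bit casts _mm_cvtsi64_m64 and _mm_cvtm64_si64, which only reinterpret bits (assumes <mmintrin.h>):

#include <mmintrin.h>

/* Casting a 64-bit integer into a __m64 and back recovers the value exactly. */
static int cast_roundtrip_sketch(long long x) {
  __m64 v = _mm_cvtsi64_m64(x);
  return _mm_cvtm64_si64(v) == x; /* always 1 */
}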
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m, int __count)
Right-shifts each 32-bit integer element of a 64-bit integer vector of [2 x i32] by the number of bit...
Definition: mmintrin.h:1076
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_set1_pi32(int __i)
Constructs a 64-bit integer vector of [2 x i32], with each of the 32-bit integer vector elements set ...
Definition: mmintrin.h:1434
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1, __m64 __m2)
Adds, with saturation, each 16-bit signed integer element of the first 64-bit integer vector of [4 x ...
Definition: mmintrin.h:473
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i)
Constructs a 64-bit integer vector, setting the lower 32 bits to the value of the 32-bit integer para...
Definition: mmintrin.h:89
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m, int __count)
Left-shifts the first parameter, which is a 64-bit integer, by the number of bits specified by the se...
Definition: mmintrin.h:884
#define _m_psrlqi
Definition: mmintrin.h:1597
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the ...
Definition: mmintrin.h:727
#define _m_punpckhbw
Definition: mmintrin.h:1559
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m, __m64 __count)
Left-shifts each 16-bit signed integer element of the first parameter, which is a 64-bit integer vect...
Definition: mmintrin.h:772
#define _m_paddsb
Definition: mmintrin.h:1568
#define _m_psllqi
Definition: mmintrin.h:1587
#define _m_pslldi
Definition: mmintrin.h:1585
#define _m_pmullw
Definition: mmintrin.h:1581
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m, __m64 __count)
Right-shifts each 16-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:1006
#define _m_psubsb
Definition: mmintrin.h:1575
static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m)
Returns the lower 32 bits of a 64-bit integer vector as a 32-bit signed integer.
Definition: mmintrin.h:106
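Similarly, a minimal sketch of the 32-bit conversions: _mm_cvtsi32_si64 places the integer in the low 32 bits and _mm_cvtsi64_si32 reads them back (assumes <mmintrin.h>):

#include <mmintrin.h>

/* The low 32 bits round-trip; the upper 32 bits of the vector are zero. */
static int cvt32_roundtrip_sketch(int x) {
  __m64 v = _mm_cvtsi32_si64(x);
  return _mm_cvtsi64_si32(v) == x; /* always 1 */
}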
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1283
#define _m_pcmpgtw
Definition: mmintrin.h:1606
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m, __m64 __count)
Right-shifts each 32-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:958
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_si64(__m64 __m, __m64 __count)
Right-shifts the first 64-bit integer parameter by the number of bits specified by the second 64-bit ...
Definition: mmintrin.h:1097
#define _m_pcmpeqb
Definition: mmintrin.h:1602
long long __m64 __attribute__((__vector_size__(8), __aligned__(8)))
Definition: mmintrin.h:17
#define _m_psrldi
Definition: mmintrin.h:1595
#define _m_from_int
Definition: mmintrin.h:1552
#define _m_paddd
Definition: mmintrin.h:1567
#define _m_psubw
Definition: mmintrin.h:1573
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1, __m64 __m2)
Subtracts, with saturation, each 8-bit signed integer element of the second 64-bit integer vector of ...
Definition: mmintrin.h:606
#define _m_psrawi
Definition: mmintrin.h:1589
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the ...
Definition: mmintrin.h:749
#define _m_psubb
Definition: mmintrin.h:1572
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:243
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_setzero_si64(void)
Constructs a 64-bit integer vector initialized to zero.
Definition: mmintrin.h:1342
#define _m_from_int64
Definition: mmintrin.h:1553
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1, __m64 __m2)
Converts, with saturation, 16-bit signed integers from both 64-bit integer vector parameters of [4 x ...
Definition: mmintrin.h:215
#define _m_psubsw
Definition: mmintrin.h:1576
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1, __m64 __m2)
Multiplies each 16-bit signed integer element of the first 64-bit integer vector of [4 x i16] by the ...
Definition: mmintrin.h:705
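A minimal sketch of _mm_madd_pi16: corresponding 16-bit elements are multiplied and adjacent 32-bit products summed, so adding the two resulting lanes (done here with plain scalar code) yields a four-element dot product (assumes <mmintrin.h>):

#include <mmintrin.h>

/* Dot product of [1, 2, 3, 4] and [5, 6, 7, 8]:
   madd produces the pair sums {1*5 + 2*6, 3*7 + 4*8} = {17, 53};
   adding the two 32-bit lanes gives 70. */
static int dot4_sketch(void) {
  __m64 a = _mm_setr_pi16(1, 2, 3, 4);
  __m64 b = _mm_setr_pi16(5, 6, 7, 8);
  long long bits = _mm_cvtm64_si64(_mm_madd_pi16(a, b));
  return (int)(bits & 0xFFFFFFFF) + (int)(bits >> 32); /* 70 */
}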
#define _m_punpcklwd
Definition: mmintrin.h:1563
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m, __m64 __count)
Right-shifts each 32-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:1053
#define _m_pxor
Definition: mmintrin.h:1601
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1, __m64 __m2)
Performs a bitwise exclusive OR of two 64-bit integer vectors.
Definition: mmintrin.h:1195
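A small sketch of the bitwise helpers: XOR-ing a vector with itself produces the same all-zero vector as _mm_setzero_si64, and OR-ing with zero leaves the value unchanged (assumes <mmintrin.h>):

#include <mmintrin.h>

/* XOR with itself clears the vector; OR with zero is an identity. */
static int bitwise_sketch(__m64 v) {
  __m64 zero = _mm_xor_si64(v, v);
  __m64 same = _mm_or_si64(v, zero);
  return _mm_cvtm64_si64(zero) == _mm_cvtm64_si64(_mm_setzero_si64()) &&
         _mm_cvtm64_si64(same) == _mm_cvtm64_si64(v); /* always 1 */
}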
#define _m_packssdw
Definition: mmintrin.h:1557
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i)
Casts a 64-bit signed integer value into a 64-bit integer vector.
Definition: mmintrin.h:122
#define _m_pmulhw
Definition: mmintrin.h:1580
#define _m_psrld
Definition: mmintrin.h:1594
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
Compares the 8-bit integer elements of two 64-bit integer vectors of [8 x i8] to determine if the ele...
Definition: mmintrin.h:1217
#define _m_paddw
Definition: mmintrin.h:1566
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m, __m64 __count)
Left-shifts the first 64-bit integer parameter by the number of bits specified by the second 64-bit i...
Definition: mmintrin.h:863
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [2 x i32] and interleaves them into a 64...
Definition: mmintrin.h:289
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1, __m64 __m2)
Subtracts each 32-bit integer element of the second 64-bit integer vector of [2 x i32] from the corre...
Definition: mmintrin.h:582
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
Unpacks the upper 32 bits from two 64-bit integer vectors of [4 x i16] and interleaves them into a 64...
Definition: mmintrin.h:267
#define _m_psraw
Definition: mmintrin.h:1588
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m, int __count)
Right-shifts the first parameter, which is a 64-bit integer, by the number of bits specified by the s...
Definition: mmintrin.h:1119
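A minimal sketch of the whole-register shifts _mm_slli_si64 and _mm_srli_si64, which treat the __m64 as one 64-bit quantity rather than packed elements (assumes <mmintrin.h>):

#include <mmintrin.h>

/* Shifting the full 64-bit value left by 8 and then logically right by 8
   clears the top byte and otherwise restores the original bits. */
static __m64 si64_shift_sketch(void) {
  __m64 v = _mm_cvtsi64_m64(0x1122334455667788LL);
  __m64 t = _mm_slli_si64(v, 8); /* 0x2233445566778800 */
  return _mm_srli_si64(t, 8);    /* 0x0022334455667788 */
}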
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_set_pi32(int __i1, int __i0)
Constructs a 64-bit integer vector initialized with the specified 32-bit integer values.
Definition: mmintrin.h:1362
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] and interleaves them into a 64-...
Definition: mmintrin.h:316
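A minimal sketch of byte interleaving with _mm_unpacklo_pi8: the low four bytes of each operand are alternated into the result (assumes <mmintrin.h>):

#include <mmintrin.h>

/* Interleaving the low halves of [0..7] and [10..17] yields the bytes
   0, 10, 1, 11, 2, 12, 3, 13 in element order. */
static __m64 unpacklo_sketch(void) {
  __m64 a = _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
  __m64 b = _mm_setr_pi8(10, 11, 12, 13, 14, 15, 16, 17);
  return _mm_unpacklo_pi8(a, b);
}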
#define _m_psubd
Definition: mmintrin.h:1574
#define _m_paddsw
Definition: mmintrin.h:1569
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_set1_pi16(short __w)
Constructs a 64-bit integer vector of [4 x i16], with each of the 16-bit integer vector elements set ...
Definition: mmintrin.h:1452
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m, __m64 __count)
Right-shifts each 16-bit integer element of the first parameter, which is a 64-bit integer vector of ...
Definition: mmintrin.h:909
#define _m_psrlq
Definition: mmintrin.h:1596
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1, __m64 __m2)
Subtracts, with saturation, each 16-bit signed integer element of the second 64-bit integer vector of...
Definition: mmintrin.h:630
#define _m_psubusw
Definition: mmintrin.h:1578
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m, int __count)
Left-shifts each 32-bit signed integer element of a 64-bit integer vector of [2 x i32] by the number ...
Definition: mmintrin.h:842
#define _m_pandn
Definition: mmintrin.h:1599
#define _m_psradi
Definition: mmintrin.h:1591
#define _m_paddusb
Definition: mmintrin.h:1570
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1239
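A minimal sketch of the element-wise comparisons: elements that satisfy the predicate become all-ones masks and the rest become zero (assumes <mmintrin.h>):

#include <mmintrin.h>

/* Compare [1, 2, 3, 4] with [1, 0, 3, 9]: equality holds in elements 0
   and 2, and "greater than" holds only in element 1. */
static void compare_sketch(void) {
  __m64 a = _mm_setr_pi16(1, 2, 3, 4);
  __m64 b = _mm_setr_pi16(1, 0, 3, 9);
  __m64 eq = _mm_cmpeq_pi16(a, b); /* 0xFFFF, 0, 0xFFFF, 0 */
  __m64 gt = _mm_cmpgt_pi16(a, b); /* 0, 0xFFFF, 0, 0 */
  (void)eq;
  (void)gt;
}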
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m, int __count)
Left-shifts each 16-bit signed integer element of a 64-bit integer vector of [4 x i16] by the number ...
Definition: mmintrin.h:795
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m, __m64 __count)
Left-shifts each 32-bit signed integer element of the first parameter, which is a 64-bit integer vect...
Definition: mmintrin.h:819
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1, __m64 __m2)
Subtracts each 16-bit unsigned integer element of the second 64-bit integer vector of [4 x i16] from ...
Definition: mmintrin.h:678
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1, __m64 __m2)
Subtracts each 16-bit integer element of the second 64-bit integer vector of [4 x i16] from the corre...
Definition: mmintrin.h:561
#define _m_pmaddwd
Definition: mmintrin.h:1579
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Constructs a 64-bit integer vector initialized with the specified 8-bit integer values.
Definition: mmintrin.h:1414
#define _m_psrad
Definition: mmintrin.h:1590
#define _m_punpcklbw
Definition: mmintrin.h:1562
#define _m_punpckldq
Definition: mmintrin.h:1564
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
Compares the 32-bit integer elements of two 64-bit integer vectors of [2 x i32] to determine if the e...
Definition: mmintrin.h:1329
#define _m_psrlw
Definition: mmintrin.h:1592
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
Unpacks the lower 32 bits from two 64-bit integer vectors of [2 x i32] and interleaves them into a 64...
Definition: mmintrin.h:362
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
Compares the 16-bit integer elements of two 64-bit integer vectors of [4 x i16] to determine if the e...
Definition: mmintrin.h:1307