clang 22.0.0git
smmintrin.h
Go to the documentation of this file.
1/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __SMMINTRIN_H
11#define __SMMINTRIN_H
12
/* SSE4 intrinsics only exist on x86 targets; fail early anywhere else. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
16
17#include <tmmintrin.h>
18
/* Define the default attributes for the functions in this file: always
 * inlined, no debug info, compiled for the "sse4.1" target feature with a
 * minimum vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
                 __min_vector_width__(128)))
23
/* When compiled as C++11 or later, the default attributes are combined with
 * constexpr; in C and older C++ this is identical to __DEFAULT_FN_ATTRS. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif
29
/* SSE4 Rounding macros. */

/* Rounding control: bits [1:0] select the rounding direction; bit [2] selects
 * the current MXCSR setting instead of bits [1:0]. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Precision exception control: bit [3] set means the PE field is not
 * updated. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Ready-made immediates combining an exception policy with a rounding
 * control, for use as the rounding argument of the _mm_round_* intrinsics. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
46
/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
63
/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
80
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
105
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
130
/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
147
/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
164
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
189
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
214
/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
247
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to an integer value using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
                                  (M)))
289
/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
322
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to an integer value using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))
364
/* SSE4 Packed Blending Intrinsics.  */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))
393
/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
                                  (int)(M)))
421
422/// Returns a 128-bit vector of [2 x double] where the values are
423/// selected from either the first or second operand as specified by the
424/// third operand, the control mask.
425///
426/// \headerfile <x86intrin.h>
427///
428/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
429///
430/// \param __V1
431/// A 128-bit vector of [2 x double].
432/// \param __V2
433/// A 128-bit vector of [2 x double].
434/// \param __M
435/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
436/// values are to be copied. The position of the mask bit corresponds to the
437/// most significant bit of a copied value. When a mask bit is 0, the
438/// corresponding 64-bit element in operand \a __V1 is copied to the same
439/// position in the result. When a mask bit is 1, the corresponding 64-bit
440/// element in operand \a __V2 is copied to the same position in the result.
441/// \returns A 128-bit vector of [2 x double] containing the copied values.
442static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
443_mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M) {
444 return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
445 (__v2df)__M);
446}
447
448/// Returns a 128-bit vector of [4 x float] where the values are
449/// selected from either the first or second operand as specified by the
450/// third operand, the control mask.
451///
452/// \headerfile <x86intrin.h>
453///
454/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
455///
456/// \param __V1
457/// A 128-bit vector of [4 x float].
458/// \param __V2
459/// A 128-bit vector of [4 x float].
460/// \param __M
461/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
462/// how the values are to be copied. The position of the mask bit corresponds
463/// to the most significant bit of a copied value. When a mask bit is 0, the
464/// corresponding 32-bit element in operand \a __V1 is copied to the same
465/// position in the result. When a mask bit is 1, the corresponding 32-bit
466/// element in operand \a __V2 is copied to the same position in the result.
467/// \returns A 128-bit vector of [4 x float] containing the copied values.
468static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
469_mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M) {
470 return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
471 (__v4sf)__M);
472}
473
474/// Returns a 128-bit vector of [16 x i8] where the values are selected
475/// from either of the first or second operand as specified by the third
476/// operand, the control mask.
477///
478/// \headerfile <x86intrin.h>
479///
480/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
481///
482/// \param __V1
483/// A 128-bit vector of [16 x i8].
484/// \param __V2
485/// A 128-bit vector of [16 x i8].
486/// \param __M
487/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
488/// how the values are to be copied. The position of the mask bit corresponds
489/// to the most significant bit of a copied value. When a mask bit is 0, the
490/// corresponding 8-bit element in operand \a __V1 is copied to the same
491/// position in the result. When a mask bit is 1, the corresponding 8-bit
492/// element in operand \a __V2 is copied to the same position in the result.
493/// \returns A 128-bit vector of [16 x i8] containing the copied values.
494static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
495_mm_blendv_epi8(__m128i __V1, __m128i __V2, __m128i __M) {
496 return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
497 (__v16qi)__M);
498}
499
/// Returns a 128-bit vector of [8 x i16] where the values are selected
///    from either of the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
                                      (__v8hi)(__m128i)(V2), (int)(M)))
527
528/* SSE4 Dword Multiply Instructions. */
529/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
530/// and returns the lower 32 bits of the each product in a 128-bit vector of
531/// [4 x i32].
532///
533/// \headerfile <x86intrin.h>
534///
535/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
536///
537/// \param __V1
538/// A 128-bit integer vector.
539/// \param __V2
540/// A 128-bit integer vector.
541/// \returns A 128-bit integer vector containing the products of both operands.
542static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
543_mm_mullo_epi32(__m128i __V1, __m128i __V2) {
544 return (__m128i)((__v4su)__V1 * (__v4su)__V2);
545}
546
547/// Multiplies corresponding even-indexed elements of two 128-bit
548/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
549/// containing the products.
550///
551/// \headerfile <x86intrin.h>
552///
553/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
554///
555/// \param __V1
556/// A 128-bit vector of [4 x i32].
557/// \param __V2
558/// A 128-bit vector of [4 x i32].
559/// \returns A 128-bit vector of [2 x i64] containing the products of both
560/// operands.
561static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
562_mm_mul_epi32(__m128i __V1, __m128i __V2) {
563 return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
564}
565
/* SSE4 Floating Point Dot Product Instructions.  */
/// Computes the dot product of the two 128-bit vectors of [4 x float]
///    and returns it in the elements of the 128-bit result vector of
///    [4 x float].
///
///    The immediate integer operand controls which input elements
///    will contribute to the dot product, and where the final results are
///    returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    each [4 x float] subvector. If a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M)                                                     \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
601
/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    each [2 x double] vector. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M)                                                     \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
                                (M)))
636
637/* SSE4 Streaming Load Hint Instruction. */
638/// Loads integer values from a 128-bit aligned memory location to a
639/// 128-bit integer vector.
640///
641/// \headerfile <x86intrin.h>
642///
643/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
644///
645/// \param __V
646/// A pointer to a 128-bit aligned memory location that contains the integer
647/// values.
648/// \returns A 128-bit integer vector containing the data stored at the
649/// specified memory location.
650static __inline__ __m128i __DEFAULT_FN_ATTRS
651_mm_stream_load_si128(const void *__V) {
652 return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
653}
654
655/* SSE4 Packed Integer Min/Max Instructions. */
656/// Compares the corresponding elements of two 128-bit vectors of
657/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
658/// of the two values.
659///
660/// \headerfile <x86intrin.h>
661///
662/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
663///
664/// \param __V1
665/// A 128-bit vector of [16 x i8].
666/// \param __V2
667/// A 128-bit vector of [16 x i8]
668/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
669static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
670_mm_min_epi8(__m128i __V1, __m128i __V2) {
671 return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
672}
673
674/// Compares the corresponding elements of two 128-bit vectors of
675/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
676/// greater value of the two.
677///
678/// \headerfile <x86intrin.h>
679///
680/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
681///
682/// \param __V1
683/// A 128-bit vector of [16 x i8].
684/// \param __V2
685/// A 128-bit vector of [16 x i8].
686/// \returns A 128-bit vector of [16 x i8] containing the greater values.
687static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
688_mm_max_epi8(__m128i __V1, __m128i __V2) {
689 return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
690}
691
692/// Compares the corresponding elements of two 128-bit vectors of
693/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
694/// value of the two.
695///
696/// \headerfile <x86intrin.h>
697///
698/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
699///
700/// \param __V1
701/// A 128-bit vector of [8 x u16].
702/// \param __V2
703/// A 128-bit vector of [8 x u16].
704/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
705static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
706_mm_min_epu16(__m128i __V1, __m128i __V2) {
707 return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
708}
709
710/// Compares the corresponding elements of two 128-bit vectors of
711/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
712/// greater value of the two.
713///
714/// \headerfile <x86intrin.h>
715///
716/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
717///
718/// \param __V1
719/// A 128-bit vector of [8 x u16].
720/// \param __V2
721/// A 128-bit vector of [8 x u16].
722/// \returns A 128-bit vector of [8 x u16] containing the greater values.
723static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
724_mm_max_epu16(__m128i __V1, __m128i __V2) {
725 return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
726}
727
728/// Compares the corresponding elements of two 128-bit vectors of
729/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
730/// value of the two.
731///
732/// \headerfile <x86intrin.h>
733///
734/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
735///
736/// \param __V1
737/// A 128-bit vector of [4 x i32].
738/// \param __V2
739/// A 128-bit vector of [4 x i32].
740/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
741static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
742_mm_min_epi32(__m128i __V1, __m128i __V2) {
743 return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
744}
745
746/// Compares the corresponding elements of two 128-bit vectors of
747/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
748/// greater value of the two.
749///
750/// \headerfile <x86intrin.h>
751///
752/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
753///
754/// \param __V1
755/// A 128-bit vector of [4 x i32].
756/// \param __V2
757/// A 128-bit vector of [4 x i32].
758/// \returns A 128-bit vector of [4 x i32] containing the greater values.
759static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
760_mm_max_epi32(__m128i __V1, __m128i __V2) {
761 return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
762}
763
764/// Compares the corresponding elements of two 128-bit vectors of
765/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
766/// value of the two.
767///
768/// \headerfile <x86intrin.h>
769///
770/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
771///
772/// \param __V1
773/// A 128-bit vector of [4 x u32].
774/// \param __V2
775/// A 128-bit vector of [4 x u32].
776/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
777static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
778_mm_min_epu32(__m128i __V1, __m128i __V2) {
779 return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
780}
781
782/// Compares the corresponding elements of two 128-bit vectors of
783/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
784/// greater value of the two.
785///
786/// \headerfile <x86intrin.h>
787///
788/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
789///
790/// \param __V1
791/// A 128-bit vector of [4 x u32].
792/// \param __V2
793/// A 128-bit vector of [4 x u32].
794/// \returns A 128-bit vector of [4 x u32] containing the greater values.
795static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
796_mm_max_epu32(__m128i __V1, __m128i __V2) {
797 return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
798}
799
800/* SSE4 Insertion and Extraction from XMM Register Instructions. */
801/// Takes the first argument \a X and inserts an element from the second
802/// argument \a Y as selected by the third argument \a N. That result then
803/// has elements zeroed out also as selected by the third argument \a N. The
804/// resulting 128-bit vector of [4 x float] is then returned.
805///
806/// \headerfile <x86intrin.h>
807///
808/// \code
809/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
810/// \endcode
811///
812/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
813///
814/// \param X
815/// A 128-bit vector source operand of [4 x float]. With the exception of
816/// those bits in the result copied from parameter \a Y and zeroed by bits
817/// [3:0] of \a N, all bits from this parameter are copied to the result.
818/// \param Y
819/// A 128-bit vector source operand of [4 x float]. One single-precision
820/// floating-point element from this source, as determined by the immediate
821/// parameter, is copied to the result.
822/// \param N
823/// Specifies which bits from operand \a Y will be copied, which bits in the
824/// result they will be copied to, and which bits in the result will be
825/// cleared. The following assignments are made: \n
826/// Bits [7:6] specify the bits to copy from operand \a Y: \n
827/// 00: Selects bits [31:0] from operand \a Y. \n
828/// 01: Selects bits [63:32] from operand \a Y. \n
829/// 10: Selects bits [95:64] from operand \a Y. \n
830/// 11: Selects bits [127:96] from operand \a Y. \n
831/// Bits [5:4] specify the bits in the result to which the selected bits
832/// from operand \a Y are copied: \n
833/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
834/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
835/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
836/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
837/// Bits[3:0]: If any of these bits are set, the corresponding result
838/// element is cleared.
839/// \returns A 128-bit vector of [4 x float] containing the copied
840/// single-precision floating point elements from the operands.
/* Macro wrapper: forwards the three operands, each parenthesized, to the
   insertps builtin. */
#define _mm_insert_ps(X, Y, N) \
  __builtin_ia32_insertps128((X), (Y), (N))
842
843/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
844/// returns it, using the immediate value parameter \a N as a selector.
845///
846/// \headerfile <x86intrin.h>
847///
848/// \code
849/// int _mm_extract_ps(__m128 X, const int N);
850/// \endcode
851///
852/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
853/// instruction.
854///
855/// \param X
856/// A 128-bit vector of [4 x float].
857/// \param N
/// An immediate value. Bits [1:0] determine which bits from the argument
859/// \a X are extracted and returned: \n
860/// 00: Bits [31:0] of parameter \a X are returned. \n
861/// 01: Bits [63:32] of parameter \a X are returned. \n
862/// 10: Bits [95:64] of parameter \a X are returned. \n
863/// 11: Bits [127:96] of parameter \a X are returned.
864/// \returns A 32-bit integer containing the extracted 32 bits of float data.
/* Reads element N of X as a float and reinterprets those 32 bits as an int
   (a bit cast, not a numeric float-to-int conversion). */
#define _mm_extract_ps(X, N) \
  __builtin_bit_cast(int, \
                     __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), \
                                                 (int)(N)))
868
869/* Miscellaneous insert and extract macros. */
/* Stores element N of the [4 x float] vector X into the lvalue D. The
   do { } while (0) wrapper makes the expansion act as a single statement. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  } while (0)
875
/* Builds the control value for _mm_insert_ps: X (source element index) is
   placed in bits [7:6], Y (destination element index) in bits [5:4], and Z
   (zeroing bits) is OR-ed into the low bits. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) \
  (((X) << 6) | ((Y) << 4) | (Z))
879
/* Yields a vector whose element 0 is element N of X; the zeroing bits 0x0e
   clear elements 1..3 of the all-zero destination. */
#define _MM_PICK_OUT_PS(X, N) \
  _mm_insert_ps(_mm_setzero_ps(), (X), \
                _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
883
884/* Insert int into packed integer array at index. */
885/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
886/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
887/// of an integer parameter \a I into an offset specified by the immediate
888/// value parameter \a N.
889///
890/// \headerfile <x86intrin.h>
891///
892/// \code
893/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
894/// \endcode
895///
896/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
897///
898/// \param X
899/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
900/// result and then one of the sixteen elements in the result vector is
901/// replaced by the lower 8 bits of \a I.
902/// \param I
903/// An integer. The lower 8 bits of this operand are written to the result
904/// beginning at the offset specified by \a N.
905/// \param N
906/// An immediate value. Bits [3:0] specify the bit offset in the result at
907/// which the lower 8 bits of \a I are written. \n
908/// 0000: Bits [7:0] of the result are used for insertion. \n
909/// 0001: Bits [15:8] of the result are used for insertion. \n
910/// 0010: Bits [23:16] of the result are used for insertion. \n
911/// 0011: Bits [31:24] of the result are used for insertion. \n
912/// 0100: Bits [39:32] of the result are used for insertion. \n
913/// 0101: Bits [47:40] of the result are used for insertion. \n
914/// 0110: Bits [55:48] of the result are used for insertion. \n
915/// 0111: Bits [63:56] of the result are used for insertion. \n
916/// 1000: Bits [71:64] of the result are used for insertion. \n
917/// 1001: Bits [79:72] of the result are used for insertion. \n
918/// 1010: Bits [87:80] of the result are used for insertion. \n
919/// 1011: Bits [95:88] of the result are used for insertion. \n
920/// 1100: Bits [103:96] of the result are used for insertion. \n
921/// 1101: Bits [111:104] of the result are used for insertion. \n
922/// 1110: Bits [119:112] of the result are used for insertion. \n
923/// 1111: Bits [127:120] of the result are used for insertion.
924/// \returns A 128-bit integer vector containing the constructed values.
/* Views X as [16 x i8], hands it to the element-set builtin together with I
   and N (both converted to int), and casts the result back to __m128i. */
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
                                         (int)(I), (int)(N)))
928
929/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
930/// the 128-bit integer vector parameter, and then inserting the 32-bit
931/// integer parameter \a I at the offset specified by the immediate value
932/// parameter \a N.
933///
934/// \headerfile <x86intrin.h>
935///
936/// \code
937/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
938/// \endcode
939///
940/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
941///
942/// \param X
943/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
944/// result and then one of the four elements in the result vector is
945/// replaced by \a I.
946/// \param I
947/// A 32-bit integer that is written to the result beginning at the offset
948/// specified by \a N.
949/// \param N
950/// An immediate value. Bits [1:0] specify the bit offset in the result at
951/// which the integer \a I is written. \n
952/// 00: Bits [31:0] of the result are used for insertion. \n
953/// 01: Bits [63:32] of the result are used for insertion. \n
954/// 10: Bits [95:64] of the result are used for insertion. \n
955/// 11: Bits [127:96] of the result are used for insertion.
956/// \returns A 128-bit integer vector containing the constructed values.
/* Views X as [4 x i32], hands it to the element-set builtin together with I
   and N (both converted to int), and casts the result back to __m128i. */
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
                                        (int)(I), (int)(N)))
960
961#ifdef __x86_64__
962/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
963/// the 128-bit integer vector parameter, and then inserting the 64-bit
964/// integer parameter \a I, using the immediate value parameter \a N as an
965/// insertion location selector.
966///
967/// \headerfile <x86intrin.h>
968///
969/// \code
970/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
971/// \endcode
972///
973/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
974///
975/// \param X
976/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
977/// result and then one of the two elements in the result vector is replaced
978/// by \a I.
979/// \param I
980/// A 64-bit integer that is written to the result beginning at the offset
981/// specified by \a N.
982/// \param N
983/// An immediate value. Bit [0] specifies the bit offset in the result at
984/// which the integer \a I is written. \n
985/// 0: Bits [63:0] of the result are used for insertion. \n
986/// 1: Bits [127:64] of the result are used for insertion. \n
987/// \returns A 128-bit integer vector containing the constructed values.
/* Views X as [2 x i64]; I is converted to long long and N to int before the
   element-set builtin is invoked. The result is cast back to __m128i. */
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
                                        (long long)(I), (int)(N)))
991#endif /* __x86_64__ */
992
993/* Extract int from packed integer array at index. This returns the element
994 * as a zero extended value, so it is unsigned.
995 */
996/// Extracts an 8-bit element from the 128-bit integer vector of
997/// [16 x i8], using the immediate value parameter \a N as a selector.
998///
999/// \headerfile <x86intrin.h>
1000///
1001/// \code
1002/// int _mm_extract_epi8(__m128i X, const int N);
1003/// \endcode
1004///
1005/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
1006///
1007/// \param X
1008/// A 128-bit integer vector.
1009/// \param N
1010/// An immediate value. Bits [3:0] specify which 8-bit vector element from
1011/// the argument \a X to extract and copy to the result. \n
1012/// 0000: Bits [7:0] of parameter \a X are extracted. \n
1013/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
1014/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
1015/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
1016/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
1017/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
1018/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
1019/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
1020/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
1021/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
1022/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
1023/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
1024/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
1025/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
1026/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
1027/// 1111: Bits [127:120] of the parameter \a X are extracted.
1028/// \returns An unsigned integer, whose lower 8 bits are selected from the
1029/// 128-bit integer vector parameter and the remaining bits are assigned
1030/// zeros.
/* The (unsigned char) cast applied to the extracted element means the final
   int is always in the range 0..255, i.e. the byte is zero-extended. */
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi( \
      (__v16qi)(__m128i)(X), (int)(N)))
1034
1035/// Extracts a 32-bit element from the 128-bit integer vector of
1036/// [4 x i32], using the immediate value parameter \a N as a selector.
1037///
1038/// \headerfile <x86intrin.h>
1039///
1040/// \code
1041/// int _mm_extract_epi32(__m128i X, const int N);
1042/// \endcode
1043///
1044/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
1045///
1046/// \param X
1047/// A 128-bit integer vector.
1048/// \param N
1049/// An immediate value. Bits [1:0] specify which 32-bit vector element from
1050/// the argument \a X to extract and copy to the result. \n
1051/// 00: Bits [31:0] of the parameter \a X are extracted. \n
1052/// 01: Bits [63:32] of the parameter \a X are extracted. \n
1053/// 10: Bits [95:64] of the parameter \a X are extracted. \n
/// 11: Bits [127:96] of the parameter \a X are extracted.
1055/// \returns An integer, whose lower 32 bits are selected from the 128-bit
1056/// integer vector parameter and the remaining bits are assigned zeros.
/* Views X as [4 x i32] and returns element N as an int. */
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), \
                                    (int)(N)))
1059
1060/// Extracts a 64-bit element from the 128-bit integer vector of
1061/// [2 x i64], using the immediate value parameter \a N as a selector.
1062///
1063/// \headerfile <x86intrin.h>
1064///
1065/// \code
1066/// long long _mm_extract_epi64(__m128i X, const int N);
1067/// \endcode
1068///
1069/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
1070/// in 64-bit mode.
1071///
1072/// \param X
1073/// A 128-bit integer vector.
1074/// \param N
1075/// An immediate value. Bit [0] specifies which 64-bit vector element from
1076/// the argument \a X to return. \n
1077/// 0: Bits [63:0] are returned. \n
1078/// 1: Bits [127:64] are returned. \n
1079/// \returns A 64-bit integer.
/* Views X as [2 x i64] and returns element N as a long long. */
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), \
                                          (int)(N)))
1082
1083/* SSE4 128-bit Packed Integer Comparisons. */
1084/// Tests whether the specified bits in a 128-bit integer vector are all
1085/// zeros.
1086///
1087/// \headerfile <x86intrin.h>
1088///
1089/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1090///
1091/// \param __M
1092/// A 128-bit integer vector containing the bits to be tested.
1093/// \param __V
1094/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1095/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1096static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
1097 __m128i __V) {
1098 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1099}
1100
1101/// Tests whether the specified bits in a 128-bit integer vector are all
1102/// ones.
1103///
1104/// \headerfile <x86intrin.h>
1105///
1106/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1107///
1108/// \param __M
1109/// A 128-bit integer vector containing the bits to be tested.
1110/// \param __V
1111/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1112/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1113static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
1114 __m128i __V) {
1115 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1116}
1117
1118/// Tests whether the specified bits in a 128-bit integer vector are
1119/// neither all zeros nor all ones.
1120///
1121/// \headerfile <x86intrin.h>
1122///
1123/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1124///
1125/// \param __M
1126/// A 128-bit integer vector containing the bits to be tested.
1127/// \param __V
1128/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1129/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1130/// FALSE otherwise.
1131static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
1132 __m128i __V) {
1133 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1134}
1135
1136/// Tests whether the specified bits in a 128-bit integer vector are all
1137/// ones.
1138///
1139/// \headerfile <x86intrin.h>
1140///
1141/// \code
1142/// int _mm_test_all_ones(__m128i V);
1143/// \endcode
1144///
1145/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1146///
1147/// \param V
1148/// A 128-bit integer vector containing the bits to be tested.
1149/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
1150/// otherwise.
/* Passes V to _mm_testc_si128 with an all-ones selector built by
   _mm_set1_epi32(-1). */
#define _mm_test_all_ones(V) \
  _mm_testc_si128((V), _mm_set1_epi32(-1))
1152
1153/// Tests whether the specified bits in a 128-bit integer vector are
1154/// neither all zeros nor all ones.
1155///
1156/// \headerfile <x86intrin.h>
1157///
1158/// \code
1159/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
1160/// \endcode
1161///
1162/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1163///
1164/// \param M
1165/// A 128-bit integer vector containing the bits to be tested.
1166/// \param V
1167/// A 128-bit integer vector selecting which bits to test in operand \a M.
1168/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1169/// FALSE otherwise.
/* Alias for _mm_testnzc_si128; operands are forwarded in the same order. */
#define _mm_test_mix_ones_zeros(M, V) \
  _mm_testnzc_si128((M), (V))
1171
1172/// Tests whether the specified bits in a 128-bit integer vector are all
1173/// zeros.
1174///
1175/// \headerfile <x86intrin.h>
1176///
1177/// \code
1178/// int _mm_test_all_zeros(__m128i M, __m128i V);
1179/// \endcode
1180///
1181/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1182///
1183/// \param M
1184/// A 128-bit integer vector containing the bits to be tested.
1185/// \param V
1186/// A 128-bit integer vector selecting which bits to test in operand \a M.
1187/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
/* Alias for _mm_testz_si128; operands are forwarded in the same order. */
#define _mm_test_all_zeros(M, V) \
  _mm_testz_si128((M), (V))
1189
1190/* SSE4 64-bit Packed Integer Comparisons. */
1191/// Compares each of the corresponding 64-bit values of the 128-bit
1192/// integer vectors for equality.
1193///
1194/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1195///
1196/// \headerfile <x86intrin.h>
1197///
1198/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1199///
1200/// \param __V1
1201/// A 128-bit integer vector.
1202/// \param __V2
1203/// A 128-bit integer vector.
1204/// \returns A 128-bit integer vector containing the comparison results.
1205static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1206_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) {
1207 return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1208}
1209
1210/* SSE4 Packed Integer Sign-Extension. */
1211/// Sign-extends each of the lower eight 8-bit integer elements of a
1212/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1213/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1214/// are unused.
1215///
1216/// \headerfile <x86intrin.h>
1217///
1218/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1219///
1220/// \param __V
1221/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1222/// sign-extended to 16-bit values.
1223/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1224static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1225_mm_cvtepi8_epi16(__m128i __V) {
1226 /* This function always performs a signed extension, but __v16qi is a char
1227 which may be signed or unsigned, so use __v16qs. */
1228 return (__m128i) __builtin_convertvector(
1229 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
1230 7),
1231 __v8hi);
1232}
1233
1234/// Sign-extends each of the lower four 8-bit integer elements of a
1235/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1236/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1237/// vector are unused.
1238///
1239/// \headerfile <x86intrin.h>
1240///
1241/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1242///
1243/// \param __V
1244/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1245/// sign-extended to 32-bit values.
1246/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1247static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1248_mm_cvtepi8_epi32(__m128i __V) {
1249 /* This function always performs a signed extension, but __v16qi is a char
1250 which may be signed or unsigned, so use __v16qs. */
1251 return (__m128i) __builtin_convertvector(
1252 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1253}
1254
1255/// Sign-extends each of the lower two 8-bit integer elements of a
1256/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1257/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1258/// vector are unused.
1259///
1260/// \headerfile <x86intrin.h>
1261///
1262/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1263///
1264/// \param __V
1265/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1266/// sign-extended to 64-bit values.
1267/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1268static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1269_mm_cvtepi8_epi64(__m128i __V) {
1270 /* This function always performs a signed extension, but __v16qi is a char
1271 which may be signed or unsigned, so use __v16qs. */
1272 return (__m128i) __builtin_convertvector(
1273 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1274}
1275
1276/// Sign-extends each of the lower four 16-bit integer elements of a
1277/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1278/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1279/// vector are unused.
1280///
1281/// \headerfile <x86intrin.h>
1282///
1283/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1284///
1285/// \param __V
1286/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1287/// sign-extended to 32-bit values.
1288/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1289static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1291 return (__m128i) __builtin_convertvector(
1292 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1293}
1294
1295/// Sign-extends each of the lower two 16-bit integer elements of a
1296/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1297/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1298/// vector are unused.
1299///
1300/// \headerfile <x86intrin.h>
1301///
1302/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1303///
1304/// \param __V
1305/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1306/// sign-extended to 64-bit values.
1307/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1308static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1310 return (__m128i) __builtin_convertvector(
1311 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1312}
1313
1314/// Sign-extends each of the lower two 32-bit integer elements of a
1315/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1316/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1317/// are unused.
1318///
1319/// \headerfile <x86intrin.h>
1320///
1321/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1322///
1323/// \param __V
1324/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1325/// sign-extended to 64-bit values.
1326/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1327static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1329 return (__m128i) __builtin_convertvector(
1330 __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1331}
1332
1333/* SSE4 Packed Integer Zero-Extension. */
1334/// Zero-extends each of the lower eight 8-bit integer elements of a
1335/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1336/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1337/// are unused.
1338///
1339/// \headerfile <x86intrin.h>
1340///
1341/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1342///
1343/// \param __V
1344/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1345/// zero-extended to 16-bit values.
1346/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1347static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1348_mm_cvtepu8_epi16(__m128i __V) {
1349 return (__m128i) __builtin_convertvector(
1350 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
1351 7),
1352 __v8hi);
1353}
1354
1355/// Zero-extends each of the lower four 8-bit integer elements of a
1356/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1357/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1358/// vector are unused.
1359///
1360/// \headerfile <x86intrin.h>
1361///
1362/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1363///
1364/// \param __V
1365/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1366/// zero-extended to 32-bit values.
1367/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1368static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1369_mm_cvtepu8_epi32(__m128i __V) {
1370 return (__m128i) __builtin_convertvector(
1371 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1372}
1373
1374/// Zero-extends each of the lower two 8-bit integer elements of a
1375/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1376/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1377/// vector are unused.
1378///
1379/// \headerfile <x86intrin.h>
1380///
1381/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1382///
1383/// \param __V
1384/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1385/// zero-extended to 64-bit values.
1386/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1387static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1388_mm_cvtepu8_epi64(__m128i __V) {
1389 return (__m128i) __builtin_convertvector(
1390 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1391}
1392
1393/// Zero-extends each of the lower four 16-bit integer elements of a
1394/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1395/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1396/// vector are unused.
1397///
1398/// \headerfile <x86intrin.h>
1399///
1400/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1401///
1402/// \param __V
1403/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1404/// zero-extended to 32-bit values.
1405/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1406static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1408 return (__m128i) __builtin_convertvector(
1409 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1410}
1411
1412/// Zero-extends each of the lower two 16-bit integer elements of a
1413/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1414/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1415/// are unused.
1416///
1417/// \headerfile <x86intrin.h>
1418///
1419/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1420///
1421/// \param __V
1422/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1423/// zero-extended to 64-bit values.
1424/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1425static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1427 return (__m128i) __builtin_convertvector(
1428 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1429}
1430
1431/// Zero-extends each of the lower two 32-bit integer elements of a
1432/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1433/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1434/// are unused.
1435///
1436/// \headerfile <x86intrin.h>
1437///
1438/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1439///
1440/// \param __V
1441/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1442/// zero-extended to 64-bit values.
1443/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1444static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1446 return (__m128i) __builtin_convertvector(
1447 __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1448}
1449
1450/* SSE4 Pack with Unsigned Saturation. */
1451/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
1452/// vector operands into 16-bit unsigned integers, and returns the packed
1453/// result.
1454///
1455/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1456/// 0x0000 are saturated to 0x0000.
1457///
1458/// \headerfile <x86intrin.h>
1459///
1460/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1461///
1462/// \param __V1
1463/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1464/// written to the lower 64 bits of the result.
1465/// \param __V2
1466/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1467/// written to the higher 64 bits of the result.
1468/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1469static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
1470 __m128i __V2) {
1471 return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1472}
1473
1474/* SSE4 Multiple Packed Sums of Absolute Difference. */
1475/// Subtracts 8-bit unsigned integer values and computes the absolute
1476/// values of the differences to the corresponding bits in the destination.
1477/// Then sums of the absolute differences are returned according to the bit
1478/// fields in the immediate operand.
1479///
1480/// \headerfile <x86intrin.h>
1481///
1482/// \code
1483/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
1484/// \endcode
1485///
1486/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
1487///
1488/// \param X
1489/// A 128-bit vector of [16 x i8].
1490/// \param Y
1491/// A 128-bit vector of [16 x i8].
1492/// \param M
1493/// An 8-bit immediate operand specifying how the absolute differences are to
1494/// be calculated, according to the following algorithm:
1495/// \code
1496/// // M2 represents bit 2 of the immediate operand
1497/// // M10 represents bits [1:0] of the immediate operand
1498/// i = M2 * 4;
1499/// j = M10 * 4;
1500/// for (k = 0; k < 8; k = k + 1) {
1501/// d0 = abs(X[i + k + 0] - Y[j + 0]);
1502/// d1 = abs(X[i + k + 1] - Y[j + 1]);
1503/// d2 = abs(X[i + k + 2] - Y[j + 2]);
1504/// d3 = abs(X[i + k + 3] - Y[j + 3]);
1505/// r[k] = d0 + d1 + d2 + d3;
1506/// }
1507/// \endcode
1508/// \returns A 128-bit integer vector containing the sums of the sets of
1509/// absolute differences between both operands.
/* X and Y are passed to the builtin as [16 x i8]; M is forwarded unchanged
   as the immediate control operand. */
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128( \
      (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (M)))
1513
/// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it along with its index.
1516///
1517/// \headerfile <x86intrin.h>
1518///
1519/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1520/// instruction.
1521///
1522/// \param __V
1523/// A 128-bit vector of [8 x u16].
1524/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1525/// in parameter \a __V, bits [18:16] contain the index of the minimum value
1526/// and the remaining bits are set to 0.
1527static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
  /* The operand is reinterpreted as __v8hi for the builtin; its result is
     cast back to __m128i for the caller. */
1528 return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
1529}
1530
1531/* Handle the sse4.2 definitions here. */
1532
1533/* These definitions are normally in nmmintrin.h, but gcc puts them in here
1534 so we'll do the same. */
1535
/* Switch the default function attributes from the "sse4.1" target used earlier
   in this file to "sse4.2".
   NOTE(review): unlike the earlier definition, this one carries no
   __min_vector_width__(128) -- confirm that the omission is intentional. */
1536#undef __DEFAULT_FN_ATTRS
1537#define __DEFAULT_FN_ATTRS \
1538 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
1539
1540/* Source data format: bits [1:0] of the immediate operand. */
1541#define _SIDD_UBYTE_OPS 0x00
1542#define _SIDD_UWORD_OPS 0x01
1543#define _SIDD_SBYTE_OPS 0x02
1544#define _SIDD_SWORD_OPS 0x03
1545
1546/* Comparison type and aggregation method: bits [3:2] of the immediate. */
1547#define _SIDD_CMP_EQUAL_ANY 0x00
1548#define _SIDD_CMP_RANGES 0x04
1549#define _SIDD_CMP_EQUAL_EACH 0x08
1550#define _SIDD_CMP_EQUAL_ORDERED 0x0c
1551
1552/* Polarity applied to the comparison bit mask: bits [5:4] of the immediate. */
1553#define _SIDD_POSITIVE_POLARITY 0x00
1554#define _SIDD_NEGATIVE_POLARITY 0x10
1555#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
1556#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
1557
1558/* Used with _mm_cmpXstri(): bit [6] selects which set-bit index is returned. */
1559#define _SIDD_LEAST_SIGNIFICANT 0x00
1560#define _SIDD_MOST_SIGNIFICANT 0x40
1561
1562/* Used with _mm_cmpXstrm(): bit [6] selects the format of the returned mask. */
1563#define _SIDD_BIT_MASK 0x00
1564#define _SIDD_UNIT_MASK 0x40
1565
1566/* SSE4.2 Packed Comparison Intrinsics. */
1567/// Uses the immediate operand \a M to perform a comparison of string
1568/// data with implicitly defined lengths that is contained in source operands
1569/// \a A and \a B. Returns a 128-bit integer vector representing the result
1570/// mask of the comparison.
1571///
1572/// \headerfile <x86intrin.h>
1573///
1574/// \code
1575/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1576/// \endcode
1577///
1578/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
1579/// instruction.
1580///
1581/// \param A
1582/// A 128-bit integer vector containing one of the source operands to be
1583/// compared.
1584/// \param B
1585/// A 128-bit integer vector containing one of the source operands to be
1586/// compared.
1587/// \param M
1588/// An 8-bit immediate operand specifying whether the characters are bytes or
1589/// words, the type of comparison to perform, and the format of the return
1590/// value. \n
1591/// Bits [1:0]: Determine source data format. \n
1592/// 00: 16 unsigned bytes \n
1593/// 01: 8 unsigned words \n
1594/// 10: 16 signed bytes \n
1595/// 11: 8 signed words \n
1596/// Bits [3:2]: Determine comparison type and aggregation method. \n
1597/// 00: Subset: Each character in \a B is compared for equality with all
1598/// the characters in \a A. \n
1599/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1600/// basis is greater than or equal for even-indexed elements in \a A,
1601/// and less than or equal for odd-indexed elements in \a A. \n
1602/// 10: Match: Compare each pair of corresponding characters in \a A and
1603/// \a B for equality. \n
1604/// 11: Substring: Search \a B for substring matches of \a A. \n
1605/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1606/// mask of the comparison results. \n
1607/// 00: No effect. \n
1608/// 01: Negate the bit mask. \n
1609/// 10: No effect. \n
1610/// 11: Negate the bit mask only for bits with an index less than or equal
1611/// to the size of \a A or \a B. \n
1612/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1613/// bytes. \n
1614/// 0: The result is zero-extended to 16 bytes. \n
1615/// 1: The result is expanded to 16 bytes (this expansion is performed by
1616/// repeating each bit 8 or 16 times).
1617/// \returns Returns a 128-bit integer vector representing the result mask of
1618/// the comparison.
/* Single call to __builtin_ia32_pcmpistrm128. A and B are converted via
   __m128i to __v16qi, M is cast to int, and the result is cast to __m128i.
   Each macro argument appears exactly once in the expansion. */
1619#define _mm_cmpistrm(A, B, M) \
1620 ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
1621 (__v16qi)(__m128i)(B), (int)(M)))
1622
1623/// Uses the immediate operand \a M to perform a comparison of string
1624/// data with implicitly defined lengths that is contained in source operands
1625/// \a A and \a B. Returns an integer representing the result index of the
1626/// comparison.
1627///
1628/// \headerfile <x86intrin.h>
1629///
1630/// \code
1631/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1632/// \endcode
1633///
1634/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1635/// instruction.
1636///
1637/// \param A
1638/// A 128-bit integer vector containing one of the source operands to be
1639/// compared.
1640/// \param B
1641/// A 128-bit integer vector containing one of the source operands to be
1642/// compared.
1643/// \param M
1644/// An 8-bit immediate operand specifying whether the characters are bytes or
1645/// words, the type of comparison to perform, and the format of the return
1646/// value. \n
1647/// Bits [1:0]: Determine source data format. \n
1648/// 00: 16 unsigned bytes \n
1649/// 01: 8 unsigned words \n
1650/// 10: 16 signed bytes \n
1651/// 11: 8 signed words \n
1652/// Bits [3:2]: Determine comparison type and aggregation method. \n
1653/// 00: Subset: Each character in \a B is compared for equality with all
1654/// the characters in \a A. \n
1655/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1656/// basis is greater than or equal for even-indexed elements in \a A,
1657/// and less than or equal for odd-indexed elements in \a A. \n
1658/// 10: Match: Compare each pair of corresponding characters in \a A and
1659/// \a B for equality. \n
1660/// 11: Substring: Search \a B for substring matches of \a A. \n
1661/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1662/// mask of the comparison results. \n
1663/// 00: No effect. \n
1664/// 01: Negate the bit mask. \n
1665/// 10: No effect. \n
1666/// 11: Negate the bit mask only for bits with an index less than or equal
1667/// to the size of \a A or \a B. \n
1668/// Bit [6]: Determines whether the index of the lowest set bit or the
1669/// highest set bit is returned. \n
1670/// 0: The index of the least significant set bit. \n
1671/// 1: The index of the most significant set bit. \n
1672/// \returns Returns an integer representing the result index of the comparison.
/* Single call to __builtin_ia32_pcmpistri128. A and B are converted via
   __m128i to __v16qi, M is cast to int, and the result is cast to int.
   Each macro argument appears exactly once in the expansion. */
1673#define _mm_cmpistri(A, B, M) \
1674 ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
1675 (__v16qi)(__m128i)(B), (int)(M)))
1676
1677/// Uses the immediate operand \a M to perform a comparison of string
1678/// data with explicitly defined lengths that is contained in source operands
1679/// \a A and \a B. Returns a 128-bit integer vector representing the result
1680/// mask of the comparison.
1681///
1682/// \headerfile <x86intrin.h>
1683///
1684/// \code
1685/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1686/// \endcode
1687///
1688/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
1689/// instruction.
1690///
1691/// \param A
1692/// A 128-bit integer vector containing one of the source operands to be
1693/// compared.
1694/// \param LA
1695/// An integer that specifies the length of the string in \a A.
1696/// \param B
1697/// A 128-bit integer vector containing one of the source operands to be
1698/// compared.
1699/// \param LB
1700/// An integer that specifies the length of the string in \a B.
1701/// \param M
1702/// An 8-bit immediate operand specifying whether the characters are bytes or
1703/// words, the type of comparison to perform, and the format of the return
1704/// value. \n
1705/// Bits [1:0]: Determine source data format. \n
1706/// 00: 16 unsigned bytes \n
1707/// 01: 8 unsigned words \n
1708/// 10: 16 signed bytes \n
1709/// 11: 8 signed words \n
1710/// Bits [3:2]: Determine comparison type and aggregation method. \n
1711/// 00: Subset: Each character in \a B is compared for equality with all
1712/// the characters in \a A. \n
1713/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1714/// basis is greater than or equal for even-indexed elements in \a A,
1715/// and less than or equal for odd-indexed elements in \a A. \n
1716/// 10: Match: Compare each pair of corresponding characters in \a A and
1717/// \a B for equality. \n
1718/// 11: Substring: Search \a B for substring matches of \a A. \n
1719/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1720/// mask of the comparison results. \n
1721/// 00: No effect. \n
1722/// 01: Negate the bit mask. \n
1723/// 10: No effect. \n
1724/// 11: Negate the bit mask only for bits with an index less than or equal
1725/// to the size of \a A or \a B. \n
1726/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1727/// bytes. \n
1728/// 0: The result is zero-extended to 16 bytes. \n
1729/// 1: The result is expanded to 16 bytes (this expansion is performed by
1730/// repeating each bit 8 or 16 times). \n
1731/// \returns Returns a 128-bit integer vector representing the result mask of
1732/// the comparison.
/* Single call to __builtin_ia32_pcmpestrm128. A and B are converted via
   __m128i to __v16qi; LA, LB and M are cast to int; the result is cast to
   __m128i. Each macro argument appears exactly once in the expansion. */
1733#define _mm_cmpestrm(A, LA, B, LB, M) \
1734 ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
1735 (__v16qi)(__m128i)(B), (int)(LB), \
1736 (int)(M)))
1737
1738/// Uses the immediate operand \a M to perform a comparison of string
1739/// data with explicitly defined lengths that is contained in source operands
1740/// \a A and \a B. Returns an integer representing the result index of the
1741/// comparison.
1742///
1743/// \headerfile <x86intrin.h>
1744///
1745/// \code
1746/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1747/// \endcode
1748///
1749/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
1750/// instruction.
1751///
1752/// \param A
1753/// A 128-bit integer vector containing one of the source operands to be
1754/// compared.
1755/// \param LA
1756/// An integer that specifies the length of the string in \a A.
1757/// \param B
1758/// A 128-bit integer vector containing one of the source operands to be
1759/// compared.
1760/// \param LB
1761/// An integer that specifies the length of the string in \a B.
1762/// \param M
1763/// An 8-bit immediate operand specifying whether the characters are bytes or
1764/// words, the type of comparison to perform, and the format of the return
1765/// value. \n
1766/// Bits [1:0]: Determine source data format. \n
1767/// 00: 16 unsigned bytes \n
1768/// 01: 8 unsigned words \n
1769/// 10: 16 signed bytes \n
1770/// 11: 8 signed words \n
1771/// Bits [3:2]: Determine comparison type and aggregation method. \n
1772/// 00: Subset: Each character in \a B is compared for equality with all
1773/// the characters in \a A. \n
1774/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1775/// basis is greater than or equal for even-indexed elements in \a A,
1776/// and less than or equal for odd-indexed elements in \a A. \n
1777/// 10: Match: Compare each pair of corresponding characters in \a A and
1778/// \a B for equality. \n
1779/// 11: Substring: Search \a B for substring matches of \a A. \n
1780/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1781/// mask of the comparison results. \n
1782/// 00: No effect. \n
1783/// 01: Negate the bit mask. \n
1784/// 10: No effect. \n
1785/// 11: Negate the bit mask only for bits with an index less than or equal
1786/// to the size of \a A or \a B. \n
1787/// Bit [6]: Determines whether the index of the lowest set bit or the
1788/// highest set bit is returned. \n
1789/// 0: The index of the least significant set bit. \n
1790/// 1: The index of the most significant set bit. \n
1791/// \returns Returns an integer representing the result index of the comparison.
/* Single call to __builtin_ia32_pcmpestri128. A and B are converted via
   __m128i to __v16qi; LA, LB and M are cast to int; the result is cast to
   int. Each macro argument appears exactly once in the expansion. */
1792#define _mm_cmpestri(A, LA, B, LB, M) \
1793 ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
1794 (__v16qi)(__m128i)(B), (int)(LB), \
1795 (int)(M)))
1796
1797/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
1798/// Uses the immediate operand \a M to perform a comparison of string
1799/// data with implicitly defined lengths that is contained in source operands
1800/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1801/// string in \a B is the maximum, otherwise, returns 0.
1802///
1803/// \headerfile <x86intrin.h>
1804///
1805/// \code
1806/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1807/// \endcode
1808///
1809/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1810/// instruction.
1811///
1812/// \param A
1813/// A 128-bit integer vector containing one of the source operands to be
1814/// compared.
1815/// \param B
1816/// A 128-bit integer vector containing one of the source operands to be
1817/// compared.
1818/// \param M
1819/// An 8-bit immediate operand specifying whether the characters are bytes or
1820/// words and the type of comparison to perform. \n
1821/// Bits [1:0]: Determine source data format. \n
1822/// 00: 16 unsigned bytes \n
1823/// 01: 8 unsigned words \n
1824/// 10: 16 signed bytes \n
1825/// 11: 8 signed words \n
1826/// Bits [3:2]: Determine comparison type and aggregation method. \n
1827/// 00: Subset: Each character in \a B is compared for equality with all
1828/// the characters in \a A. \n
1829/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1830/// basis is greater than or equal for even-indexed elements in \a A,
1831/// and less than or equal for odd-indexed elements in \a A. \n
1832/// 10: Match: Compare each pair of corresponding characters in \a A and
1833/// \a B for equality. \n
1834/// 11: Substring: Search \a B for substring matches of \a A. \n
1835/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1836/// mask of the comparison results. \n
1837/// 00: No effect. \n
1838/// 01: Negate the bit mask. \n
1839/// 10: No effect. \n
1840/// 11: Negate the bit mask only for bits with an index less than or equal
1841/// to the size of \a A or \a B. \n
1842/// \returns Returns 1 if the bit mask is zero and the length of the string in
1843/// \a B is the maximum; otherwise, returns 0.
/* Single call to __builtin_ia32_pcmpistria128. A and B are converted via
   __m128i to __v16qi, M is cast to int, and the result is cast to int.
   Each macro argument appears exactly once in the expansion. */
1844#define _mm_cmpistra(A, B, M) \
1845 ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
1846 (__v16qi)(__m128i)(B), (int)(M)))
1847
1848/// Uses the immediate operand \a M to perform a comparison of string
1849/// data with implicitly defined lengths that is contained in source operands
1850/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
1851/// 0.
1852///
1853/// \headerfile <x86intrin.h>
1854///
1855/// \code
1856/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1857/// \endcode
1858///
1859/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1860/// instruction.
1861///
1862/// \param A
1863/// A 128-bit integer vector containing one of the source operands to be
1864/// compared.
1865/// \param B
1866/// A 128-bit integer vector containing one of the source operands to be
1867/// compared.
1868/// \param M
1869/// An 8-bit immediate operand specifying whether the characters are bytes or
1870/// words and the type of comparison to perform. \n
1871/// Bits [1:0]: Determine source data format. \n
1872/// 00: 16 unsigned bytes \n
1873/// 01: 8 unsigned words \n
1874/// 10: 16 signed bytes \n
1875/// 11: 8 signed words \n
1876/// Bits [3:2]: Determine comparison type and aggregation method. \n
1877/// 00: Subset: Each character in \a B is compared for equality with all
1878/// the characters in \a A. \n
1879/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1880/// basis is greater than or equal for even-indexed elements in \a A,
1881/// and less than or equal for odd-indexed elements in \a A. \n
1882/// 10: Match: Compare each pair of corresponding characters in \a A and
1883/// \a B for equality. \n
1884/// 11: Substring: Search \a B for substring matches of \a A. \n
1885/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1886/// mask of the comparison results. \n
1887/// 00: No effect. \n
1888/// 01: Negate the bit mask. \n
1889/// 10: No effect. \n
1890/// 11: Negate the bit mask only for bits with an index less than or equal
1891/// to the size of \a A or \a B.
1892/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
/* Single call to __builtin_ia32_pcmpistric128. A and B are converted via
   __m128i to __v16qi, M is cast to int, and the result is cast to int.
   Each macro argument appears exactly once in the expansion. */
1893#define _mm_cmpistrc(A, B, M) \
1894 ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
1895 (__v16qi)(__m128i)(B), (int)(M)))
1896
1897/// Uses the immediate operand \a M to perform a comparison of string
1898/// data with implicitly defined lengths that is contained in source operands
1899/// \a A and \a B. Returns bit 0 of the resulting bit mask.
1900///
1901/// \headerfile <x86intrin.h>
1902///
1903/// \code
1904/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
1905/// \endcode
1906///
1907/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1908/// instruction.
1909///
1910/// \param A
1911/// A 128-bit integer vector containing one of the source operands to be
1912/// compared.
1913/// \param B
1914/// A 128-bit integer vector containing one of the source operands to be
1915/// compared.
1916/// \param M
1917/// An 8-bit immediate operand specifying whether the characters are bytes or
1918/// words and the type of comparison to perform. \n
1919/// Bits [1:0]: Determine source data format. \n
1920/// 00: 16 unsigned bytes \n
1921/// 01: 8 unsigned words \n
1922/// 10: 16 signed bytes \n
1923/// 11: 8 signed words \n
1924/// Bits [3:2]: Determine comparison type and aggregation method. \n
1925/// 00: Subset: Each character in \a B is compared for equality with all
1926/// the characters in \a A. \n
1927/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1928/// basis is greater than or equal for even-indexed elements in \a A,
1929/// and less than or equal for odd-indexed elements in \a A. \n
1930/// 10: Match: Compare each pair of corresponding characters in \a A and
1931/// \a B for equality. \n
1932/// 11: Substring: Search \a B for substring matches of \a A. \n
1933/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1934/// mask of the comparison results. \n
1935/// 00: No effect. \n
1936/// 01: Negate the bit mask. \n
1937/// 10: No effect. \n
1938/// 11: Negate the bit mask only for bits with an index less than or equal
1939/// to the size of \a A or \a B. \n
1940/// \returns Returns bit 0 of the resulting bit mask.
/* Single call to __builtin_ia32_pcmpistrio128. A and B are converted via
   __m128i to __v16qi, M is cast to int, and the result is cast to int.
   Each macro argument appears exactly once in the expansion. */
1941#define _mm_cmpistro(A, B, M) \
1942 ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
1943 (__v16qi)(__m128i)(B), (int)(M)))
1944
1945/// Uses the immediate operand \a M to perform a comparison of string
1946/// data with implicitly defined lengths that is contained in source operands
1947/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
1948/// the maximum, otherwise, returns 0.
1949///
1950/// \headerfile <x86intrin.h>
1951///
1952/// \code
1953/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
1954/// \endcode
1955///
1956/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1957/// instruction.
1958///
1959/// \param A
1960/// A 128-bit integer vector containing one of the source operands to be
1961/// compared.
1962/// \param B
1963/// A 128-bit integer vector containing one of the source operands to be
1964/// compared.
1965/// \param M
1966/// An 8-bit immediate operand specifying whether the characters are bytes or
1967/// words and the type of comparison to perform. \n
1968/// Bits [1:0]: Determine source data format. \n
1969/// 00: 16 unsigned bytes \n
1970/// 01: 8 unsigned words \n
1971/// 10: 16 signed bytes \n
1972/// 11: 8 signed words \n
1973/// Bits [3:2]: Determine comparison type and aggregation method. \n
1974/// 00: Subset: Each character in \a B is compared for equality with all
1975/// the characters in \a A. \n
1976/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1977/// basis is greater than or equal for even-indexed elements in \a A,
1978/// and less than or equal for odd-indexed elements in \a A. \n
1979/// 10: Match: Compare each pair of corresponding characters in \a A and
1980/// \a B for equality. \n
1981/// 11: Substring: Search \a B for substring matches of \a A. \n
1982/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1983/// mask of the comparison results. \n
1984/// 00: No effect. \n
1985/// 01: Negate the bit mask. \n
1986/// 10: No effect. \n
1987/// 11: Negate the bit mask only for bits with an index less than or equal
1988/// to the size of \a A or \a B. \n
1989/// \returns Returns 1 if the length of the string in \a A is less than the
1990/// maximum, otherwise, returns 0.
/* Single call to __builtin_ia32_pcmpistris128. A and B are converted via
   __m128i to __v16qi, M is cast to int, and the result is cast to int.
   Each macro argument appears exactly once in the expansion. */
1991#define _mm_cmpistrs(A, B, M) \
1992 ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
1993 (__v16qi)(__m128i)(B), (int)(M)))
1994
1995/// Uses the immediate operand \a M to perform a comparison of string
1996/// data with implicitly defined lengths that is contained in source operands
1997/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
1998/// the maximum, otherwise, returns 0.
1999///
2000/// \headerfile <x86intrin.h>
2001///
2002/// \code
2003/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
2004/// \endcode
2005///
2006/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
2007/// instruction.
2008///
2009/// \param A
2010/// A 128-bit integer vector containing one of the source operands to be
2011/// compared.
2012/// \param B
2013/// A 128-bit integer vector containing one of the source operands to be
2014/// compared.
2015/// \param M
2016/// An 8-bit immediate operand specifying whether the characters are bytes or
2017/// words and the type of comparison to perform. \n
2018/// Bits [1:0]: Determine source data format. \n
2019/// 00: 16 unsigned bytes \n
2020/// 01: 8 unsigned words \n
2021/// 10: 16 signed bytes \n
2022/// 11: 8 signed words \n
2023/// Bits [3:2]: Determine comparison type and aggregation method. \n
2024/// 00: Subset: Each character in \a B is compared for equality with all
2025/// the characters in \a A. \n
2026/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2027/// basis is greater than or equal for even-indexed elements in \a A,
2028/// and less than or equal for odd-indexed elements in \a A. \n
2029/// 10: Match: Compare each pair of corresponding characters in \a A and
2030/// \a B for equality. \n
2031/// 11: Substring: Search \a B for substring matches of \a A. \n
2032/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2033/// mask of the comparison results. \n
2034/// 00: No effect. \n
2035/// 01: Negate the bit mask. \n
2036/// 10: No effect. \n
2037/// 11: Negate the bit mask only for bits with an index less than or equal
2038/// to the size of \a A or \a B.
2039/// \returns Returns 1 if the length of the string in \a B is less than the
2040/// maximum, otherwise, returns 0.
/* Single call to __builtin_ia32_pcmpistriz128. A and B are converted via
   __m128i to __v16qi, M is cast to int, and the result is cast to int.
   Each macro argument appears exactly once in the expansion. */
2041#define _mm_cmpistrz(A, B, M) \
2042 ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
2043 (__v16qi)(__m128i)(B), (int)(M)))
2044
2045/// Uses the immediate operand \a M to perform a comparison of string
2046/// data with explicitly defined lengths that is contained in source operands
2047/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2048/// string in \a B is the maximum, otherwise, returns 0.
2049///
2050/// \headerfile <x86intrin.h>
2051///
2052/// \code
2053/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2054/// \endcode
2055///
2056/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2057/// instruction.
2058///
2059/// \param A
2060/// A 128-bit integer vector containing one of the source operands to be
2061/// compared.
2062/// \param LA
2063/// An integer that specifies the length of the string in \a A.
2064/// \param B
2065/// A 128-bit integer vector containing one of the source operands to be
2066/// compared.
2067/// \param LB
2068/// An integer that specifies the length of the string in \a B.
2069/// \param M
2070/// An 8-bit immediate operand specifying whether the characters are bytes or
2071/// words and the type of comparison to perform. \n
2072/// Bits [1:0]: Determine source data format. \n
2073/// 00: 16 unsigned bytes \n
2074/// 01: 8 unsigned words \n
2075/// 10: 16 signed bytes \n
2076/// 11: 8 signed words \n
2077/// Bits [3:2]: Determine comparison type and aggregation method. \n
2078/// 00: Subset: Each character in \a B is compared for equality with all
2079/// the characters in \a A. \n
2080/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2081/// basis is greater than or equal for even-indexed elements in \a A,
2082/// and less than or equal for odd-indexed elements in \a A. \n
2083/// 10: Match: Compare each pair of corresponding characters in \a A and
2084/// \a B for equality. \n
2085/// 11: Substring: Search \a B for substring matches of \a A. \n
2086/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2087/// mask of the comparison results. \n
2088/// 00: No effect. \n
2089/// 01: Negate the bit mask. \n
2090/// 10: No effect. \n
2091/// 11: Negate the bit mask only for bits with an index less than or equal
2092/// to the size of \a A or \a B.
2093/// \returns Returns 1 if the bit mask is zero and the length of the string in
2094/// \a B is the maximum, otherwise, returns 0.
/* Single call to __builtin_ia32_pcmpestria128. A and B are converted via
   __m128i to __v16qi; LA, LB and M are cast to int; the result is cast to
   int. Each macro argument appears exactly once in the expansion. */
2095#define _mm_cmpestra(A, LA, B, LB, M) \
2096 ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
2097 (__v16qi)(__m128i)(B), (int)(LB), \
2098 (int)(M)))
2099
2100/// Uses the immediate operand \a M to perform a comparison of string
2101/// data with explicitly defined lengths that is contained in source operands
2102/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
2103/// returns 0.
2104///
2105/// \headerfile <x86intrin.h>
2106///
2107/// \code
2108/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2109/// \endcode
2110///
2111/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2112/// instruction.
2113///
2114/// \param A
2115/// A 128-bit integer vector containing one of the source operands to be
2116/// compared.
2117/// \param LA
2118/// An integer that specifies the length of the string in \a A.
2119/// \param B
2120/// A 128-bit integer vector containing one of the source operands to be
2121/// compared.
2122/// \param LB
2123/// An integer that specifies the length of the string in \a B.
2124/// \param M
2125/// An 8-bit immediate operand specifying whether the characters are bytes or
2126/// words and the type of comparison to perform. \n
2127/// Bits [1:0]: Determine source data format. \n
2128/// 00: 16 unsigned bytes \n
2129/// 01: 8 unsigned words \n
2130/// 10: 16 signed bytes \n
2131/// 11: 8 signed words \n
2132/// Bits [3:2]: Determine comparison type and aggregation method. \n
2133/// 00: Subset: Each character in \a B is compared for equality with all
2134/// the characters in \a A. \n
2135/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2136/// basis is greater than or equal for even-indexed elements in \a A,
2137/// and less than or equal for odd-indexed elements in \a A. \n
2138/// 10: Match: Compare each pair of corresponding characters in \a A and
2139/// \a B for equality. \n
2140/// 11: Substring: Search \a B for substring matches of \a A. \n
2141/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2142/// mask of the comparison results. \n
2143/// 00: No effect. \n
2144/// 01: Negate the bit mask. \n
2145/// 10: No effect. \n
2146/// 11: Negate the bit mask only for bits with an index less than or equal
2147/// to the size of \a A or \a B. \n
2148/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
/* Single call to __builtin_ia32_pcmpestric128. A and B are converted via
   __m128i to __v16qi; LA, LB and M are cast to int; the result is cast to
   int. Each macro argument appears exactly once in the expansion. */
2149#define _mm_cmpestrc(A, LA, B, LB, M) \
2150 ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
2151 (__v16qi)(__m128i)(B), (int)(LB), \
2152 (int)(M)))
2153
2154/// Uses the immediate operand \a M to perform a comparison of string
2155/// data with explicitly defined lengths that is contained in source operands
2156/// \a A and \a B. Returns bit 0 of the resulting bit mask.
2157///
2158/// \headerfile <x86intrin.h>
2159///
2160/// \code
2161/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
2162/// \endcode
2163///
2164/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2165/// instruction.
2166///
2167/// \param A
2168/// A 128-bit integer vector containing one of the source operands to be
2169/// compared.
2170/// \param LA
2171/// An integer that specifies the length of the string in \a A.
2172/// \param B
2173/// A 128-bit integer vector containing one of the source operands to be
2174/// compared.
2175/// \param LB
2176/// An integer that specifies the length of the string in \a B.
2177/// \param M
2178/// An 8-bit immediate operand specifying whether the characters are bytes or
2179/// words and the type of comparison to perform. \n
2180/// Bits [1:0]: Determine source data format. \n
2181/// 00: 16 unsigned bytes \n
2182/// 01: 8 unsigned words \n
2183/// 10: 16 signed bytes \n
2184/// 11: 8 signed words \n
2185/// Bits [3:2]: Determine comparison type and aggregation method. \n
2186/// 00: Subset: Each character in \a B is compared for equality with all
2187/// the characters in \a A. \n
2188/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2189/// basis is greater than or equal for even-indexed elements in \a A,
2190/// and less than or equal for odd-indexed elements in \a A. \n
2191/// 10: Match: Compare each pair of corresponding characters in \a A and
2192/// \a B for equality. \n
2193/// 11: Substring: Search \a B for substring matches of \a A. \n
2194/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2195/// mask of the comparison results. \n
2196/// 00: No effect. \n
2197/// 01: Negate the bit mask. \n
2198/// 10: No effect. \n
2199/// 11: Negate the bit mask only for bits with an index less than or equal
2200/// to the size of \a A or \a B.
2201/// \returns Returns bit 0 of the resulting bit mask.
/* Single call to __builtin_ia32_pcmpestrio128. A and B are converted via
   __m128i to __v16qi; LA, LB and M are cast to int; the result is cast to
   int. Each macro argument appears exactly once in the expansion. */
2202#define _mm_cmpestro(A, LA, B, LB, M) \
2203 ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
2204 (__v16qi)(__m128i)(B), (int)(LB), \
2205 (int)(M)))
2206
2207/// Uses the immediate operand \a M to perform a comparison of string
2208/// data with explicitly defined lengths that is contained in source operands
2209/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
2210/// the maximum; otherwise, returns 0.
2211///
2212/// \headerfile <x86intrin.h>
2213///
2214/// \code
2215/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
2216/// \endcode
2217///
2218/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2219/// instruction.
2220///
2221/// \param A
2222/// A 128-bit integer vector containing one of the source operands to be
2223/// compared.
2224/// \param LA
2225/// An integer that specifies the length of the string in \a A.
2226/// \param B
2227/// A 128-bit integer vector containing one of the source operands to be
2228/// compared.
2229/// \param LB
2230/// An integer that specifies the length of the string in \a B.
2231/// \param M
2232/// An 8-bit immediate operand specifying whether the characters are bytes or
2233/// words and the type of comparison to perform. \n
2234/// Bits [1:0]: Determine source data format. \n
2235/// 00: 16 unsigned bytes \n
2236/// 01: 8 unsigned words \n
2237/// 10: 16 signed bytes \n
2238/// 11: 8 signed words \n
2239/// Bits [3:2]: Determine comparison type and aggregation method. \n
2240/// 00: Subset: Each character in \a B is compared for equality with all
2241/// the characters in \a A. \n
2242/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2243/// basis is greater than or equal for even-indexed elements in \a A,
2244/// and less than or equal for odd-indexed elements in \a A. \n
2245/// 10: Match: Compare each pair of corresponding characters in \a A and
2246/// \a B for equality. \n
2247/// 11: Substring: Search \a B for substring matches of \a A. \n
2248/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2249/// mask of the comparison results. \n
2250/// 00: No effect. \n
2251/// 01: Negate the bit mask. \n
2252/// 10: No effect. \n
2253/// 11: Negate the bit mask only for bits with an index less than or equal
2254/// to the size of \a A or \a B.
2255/// \returns Returns 1 if the length of the string in \a A is less than the
2256/// maximum; otherwise, returns 0.
2257#define _mm_cmpestrs(A, LA, B, LB, M) \
2258 ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
2259 (__v16qi)(__m128i)(B), (int)(LB), \
2260 (int)(M)))
2261
2262/// Uses the immediate operand \a M to perform a comparison of string
2263/// data with explicitly defined lengths that is contained in source operands
2264/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2265/// the maximum; otherwise, returns 0.
2266///
2267/// \headerfile <x86intrin.h>
2268///
2269/// \code
2270/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
2271/// \endcode
2272///
2273/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
2274///
2275/// \param A
2276/// A 128-bit integer vector containing one of the source operands to be
2277/// compared.
2278/// \param LA
2279/// An integer that specifies the length of the string in \a A.
2280/// \param B
2281/// A 128-bit integer vector containing one of the source operands to be
2282/// compared.
2283/// \param LB
2284/// An integer that specifies the length of the string in \a B.
2285/// \param M
2286/// An 8-bit immediate operand specifying whether the characters are bytes or
2287/// words and the type of comparison to perform. \n
2288/// Bits [1:0]: Determine source data format. \n
2289/// 00: 16 unsigned bytes \n
2290/// 01: 8 unsigned words \n
2291/// 10: 16 signed bytes \n
2292/// 11: 8 signed words \n
2293/// Bits [3:2]: Determine comparison type and aggregation method. \n
2294/// 00: Subset: Each character in \a B is compared for equality with all
2295/// the characters in \a A. \n
2296/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2297/// basis is greater than or equal for even-indexed elements in \a A,
2298/// and less than or equal for odd-indexed elements in \a A. \n
2299/// 10: Match: Compare each pair of corresponding characters in \a A and
2300/// \a B for equality. \n
2301/// 11: Substring: Search \a B for substring matches of \a A. \n
2302/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2303/// mask of the comparison results. \n
2304/// 00: No effect. \n
2305/// 01: Negate the bit mask. \n
2306/// 10: No effect. \n
2307/// 11: Negate the bit mask only for bits with an index less than or equal
2308/// to the size of \a A or \a B.
2309/// \returns Returns 1 if the length of the string in \a B is less than the
2310/// maximum; otherwise, returns 0.
2311#define _mm_cmpestrz(A, LA, B, LB, M) \
2312 ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
2313 (__v16qi)(__m128i)(B), (int)(LB), \
2314 (int)(M)))
2315
2316/* SSE4.2 Compare Packed Data -- Greater Than. */
2317/// Compares each of the corresponding 64-bit values of the 128-bit
2318/// integer vectors to determine if the values in the first operand are
2319/// greater than those in the second operand.
2320///
2321/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
2322///
2323/// \headerfile <x86intrin.h>
2324///
2325/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2326///
2327/// \param __V1
2328/// A 128-bit integer vector; its 64-bit values are the left-hand operands.
2329/// \param __V2
2330/// A 128-bit integer vector; its 64-bit values are the right-hand operands.
2331/// \returns A 128-bit integer vector holding one 64-bit mask per comparison.
2332static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2333_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) {
2334 return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2335}
2336
2337#undef __DEFAULT_FN_ATTRS
2338#undef __DEFAULT_FN_ATTRS_CONSTEXPR
2339
2340#include <popcntintrin.h>
2341
2342#include <crc32intrin.h>
2343
2344#endif /* __SMMINTRIN_H */
#define __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding even-indexed elements of two 128-bit vectors of [4 x i32] and returns a 128-...
Definition smmintrin.h:562
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition smmintrin.h:706
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_stream_load_si128(const void *__V)
Loads integer values from a 128-bit aligned memory location to a 128-bit integer vector.
Definition smmintrin.h:651
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition smmintrin.h:760
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors to determine if the v...
Definition smmintrin.h:2333
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors for equality.
Definition smmintrin.h:1206
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition smmintrin.h:688
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_blendv_epi8(__m128i __V1, __m128i __V2, __m128i __M)
Returns a 128-bit vector of [16 x i8] where the values are selected from either of the first or secon...
Definition smmintrin.h:495
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M)
Returns a 128-bit vector of [2 x double] where the values are selected from either the first or secon...
Definition smmintrin.h:443
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M)
Returns a 128-bit vector of [4 x float] where the values are selected from either the first or second...
Definition smmintrin.h:469
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi16(__m128i __V)
Zero-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition smmintrin.h:1348
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu16_epi64(__m128i __V)
Zero-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition smmintrin.h:1426
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, __m128i __V2)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition smmintrin.h:1469
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition smmintrin.h:796
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu32_epi64(__m128i __V)
Zero-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition smmintrin.h:1445
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi32(__m128i __V)
Sign-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition smmintrin.h:1248
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi16_epi64(__m128i __V)
Sign-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition smmintrin.h:1309
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi32(__m128i __V)
Zero-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition smmintrin.h:1369
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi64(__m128i __V)
Sign-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition smmintrin.h:1269
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are neither all zeros nor all ones.
Definition smmintrin.h:1131
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition smmintrin.h:742
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all ones.
Definition smmintrin.h:1113
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mullo_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding elements of two 128-bit vectors of [4 x i32] and returns the lower 32 bits of...
Definition smmintrin.h:543
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi16_epi32(__m128i __V)
Sign-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition smmintrin.h:1290
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition smmintrin.h:778
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi32_epi64(__m128i __V)
Sign-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition smmintrin.h:1328
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu16_epi32(__m128i __V)
Zero-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition smmintrin.h:1407
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition smmintrin.h:724
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi16(__m128i __V)
Sign-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition smmintrin.h:1225
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi64(__m128i __V)
Zero-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition smmintrin.h:1388
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V)
Finds the minimum unsigned 16-bit element in the input 128-bit vector of [8 x u16] and returns it and...
Definition smmintrin.h:1527
static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all zeros.
Definition smmintrin.h:1096
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition smmintrin.h:670