clang 22.0.0git
smmintrin.h
Go to the documentation of this file.
/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __SMMINTRIN_H
11#define __SMMINTRIN_H
12
/* Reject compilation on targets that define neither __i386__ nor __x86_64__. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

17#include <tmmintrin.h>
18
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
                 __min_vector_width__(128)))

/* Same attribute set, with constexpr appended when compiling as C++11 or
 * later; identical to __DEFAULT_FN_ATTRS otherwise. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

/* SSE4 Rounding macros. */
/* Rounding-direction selectors. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Exception-control selectors. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Combined values: one exception-control selector OR'ed with one direction. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

/// Rounds up each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)

/// Rounds up each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
                                  (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X),                       \
                                   (__v2df)(__m128d)(Y), (M)))

/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
                                  (int)(M)))

422/// Returns a 128-bit vector of [2 x double] where the values are
423/// selected from either the first or second operand as specified by the
424/// third operand, the control mask.
425///
426/// \headerfile <x86intrin.h>
427///
428/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
429///
430/// \param __V1
431/// A 128-bit vector of [2 x double].
432/// \param __V2
433/// A 128-bit vector of [2 x double].
434/// \param __M
435/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
436/// values are to be copied. The position of the mask bit corresponds to the
437/// most significant bit of a copied value. When a mask bit is 0, the
438/// corresponding 64-bit element in operand \a __V1 is copied to the same
439/// position in the result. When a mask bit is 1, the corresponding 64-bit
440/// element in operand \a __V2 is copied to the same position in the result.
441/// \returns A 128-bit vector of [2 x double] containing the copied values.
442static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
443_mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M) {
444 return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
445 (__v2df)__M);
446}
447
448/// Returns a 128-bit vector of [4 x float] where the values are
449/// selected from either the first or second operand as specified by the
450/// third operand, the control mask.
451///
452/// \headerfile <x86intrin.h>
453///
454/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
455///
456/// \param __V1
457/// A 128-bit vector of [4 x float].
458/// \param __V2
459/// A 128-bit vector of [4 x float].
460/// \param __M
461/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
462/// how the values are to be copied. The position of the mask bit corresponds
463/// to the most significant bit of a copied value. When a mask bit is 0, the
464/// corresponding 32-bit element in operand \a __V1 is copied to the same
465/// position in the result. When a mask bit is 1, the corresponding 32-bit
466/// element in operand \a __V2 is copied to the same position in the result.
467/// \returns A 128-bit vector of [4 x float] containing the copied values.
468static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
469_mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M) {
470 return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
471 (__v4sf)__M);
472}
473
474/// Returns a 128-bit vector of [16 x i8] where the values are selected
475/// from either of the first or second operand as specified by the third
476/// operand, the control mask.
477///
478/// \headerfile <x86intrin.h>
479///
480/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
481///
482/// \param __V1
483/// A 128-bit vector of [16 x i8].
484/// \param __V2
485/// A 128-bit vector of [16 x i8].
486/// \param __M
487/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
488/// how the values are to be copied. The position of the mask bit corresponds
489/// to the most significant bit of a copied value. When a mask bit is 0, the
490/// corresponding 8-bit element in operand \a __V1 is copied to the same
491/// position in the result. When a mask bit is 1, the corresponding 8-bit
492/// element in operand \a __V2 is copied to the same position in the result.
493/// \returns A 128-bit vector of [16 x i8] containing the copied values.
494static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
495_mm_blendv_epi8(__m128i __V1, __m128i __V2, __m128i __M) {
496 return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
497 (__v16qi)__M);
498}
499
/// Returns a 128-bit vector of [8 x i16] where the values are selected
///    from either of the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
                                      (__v8hi)(__m128i)(V2), (int)(M)))

528/* SSE4 Dword Multiply Instructions. */
529/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
530/// and returns the lower 32 bits of the each product in a 128-bit vector of
531/// [4 x i32].
532///
533/// \headerfile <x86intrin.h>
534///
535/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
536///
537/// \param __V1
538/// A 128-bit integer vector.
539/// \param __V2
540/// A 128-bit integer vector.
541/// \returns A 128-bit integer vector containing the products of both operands.
542static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
543_mm_mullo_epi32(__m128i __V1, __m128i __V2) {
544 return (__m128i)((__v4su)__V1 * (__v4su)__V2);
545}
546
547/// Multiplies corresponding even-indexed elements of two 128-bit
548/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
549/// containing the products.
550///
551/// \headerfile <x86intrin.h>
552///
553/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
554///
555/// \param __V1
556/// A 128-bit vector of [4 x i32].
557/// \param __V2
558/// A 128-bit vector of [4 x i32].
559/// \returns A 128-bit vector of [2 x i64] containing the products of both
560/// operands.
561static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
562_mm_mul_epi32(__m128i __V1, __m128i __V2) {
563 return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
564}
565
/* SSE4 Floating Point Dot Product Instructions. */
/// Computes the dot product of the two 128-bit vectors of [4 x float]
///    and returns it in the elements of the 128-bit result vector of
///    [4 x float].
///
///    The immediate integer operand controls which input elements
///    will contribute to the dot product, and where the final results are
///    returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    the [4 x float] result vector. If a bit is set, the dot product is
///    returned in the corresponding element; otherwise that element is set to
///    zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M)                                                     \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    the [2 x double] result vector. If a bit is set, the dot product is
///    returned in the corresponding element; otherwise that element is set to
///    zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M)                                                     \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
                                (M)))

637/* SSE4 Streaming Load Hint Instruction. */
638/// Loads integer values from a 128-bit aligned memory location to a
639/// 128-bit integer vector.
640///
641/// \headerfile <x86intrin.h>
642///
643/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
644///
645/// \param __V
646/// A pointer to a 128-bit aligned memory location that contains the integer
647/// values.
648/// \returns A 128-bit integer vector containing the data stored at the
649/// specified memory location.
650static __inline__ __m128i __DEFAULT_FN_ATTRS
651_mm_stream_load_si128(const void *__V) {
652 return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
653}
654
655/* SSE4 Packed Integer Min/Max Instructions. */
656/// Compares the corresponding elements of two 128-bit vectors of
657/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
658/// of the two values.
659///
660/// \headerfile <x86intrin.h>
661///
662/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
663///
664/// \param __V1
665/// A 128-bit vector of [16 x i8].
666/// \param __V2
667/// A 128-bit vector of [16 x i8]
668/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
669static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
670_mm_min_epi8(__m128i __V1, __m128i __V2) {
671 return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
672}
673
674/// Compares the corresponding elements of two 128-bit vectors of
675/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
676/// greater value of the two.
677///
678/// \headerfile <x86intrin.h>
679///
680/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
681///
682/// \param __V1
683/// A 128-bit vector of [16 x i8].
684/// \param __V2
685/// A 128-bit vector of [16 x i8].
686/// \returns A 128-bit vector of [16 x i8] containing the greater values.
687static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
688_mm_max_epi8(__m128i __V1, __m128i __V2) {
689 return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
690}
691
692/// Compares the corresponding elements of two 128-bit vectors of
693/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
694/// value of the two.
695///
696/// \headerfile <x86intrin.h>
697///
698/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
699///
700/// \param __V1
701/// A 128-bit vector of [8 x u16].
702/// \param __V2
703/// A 128-bit vector of [8 x u16].
704/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
705static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
706_mm_min_epu16(__m128i __V1, __m128i __V2) {
707 return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
708}
709
710/// Compares the corresponding elements of two 128-bit vectors of
711/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
712/// greater value of the two.
713///
714/// \headerfile <x86intrin.h>
715///
716/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
717///
718/// \param __V1
719/// A 128-bit vector of [8 x u16].
720/// \param __V2
721/// A 128-bit vector of [8 x u16].
722/// \returns A 128-bit vector of [8 x u16] containing the greater values.
723static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
724_mm_max_epu16(__m128i __V1, __m128i __V2) {
725 return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
726}
727
728/// Compares the corresponding elements of two 128-bit vectors of
729/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
730/// value of the two.
731///
732/// \headerfile <x86intrin.h>
733///
734/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
735///
736/// \param __V1
737/// A 128-bit vector of [4 x i32].
738/// \param __V2
739/// A 128-bit vector of [4 x i32].
740/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
741static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
742_mm_min_epi32(__m128i __V1, __m128i __V2) {
743 return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
744}
745
746/// Compares the corresponding elements of two 128-bit vectors of
747/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
748/// greater value of the two.
749///
750/// \headerfile <x86intrin.h>
751///
752/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
753///
754/// \param __V1
755/// A 128-bit vector of [4 x i32].
756/// \param __V2
757/// A 128-bit vector of [4 x i32].
758/// \returns A 128-bit vector of [4 x i32] containing the greater values.
759static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
760_mm_max_epi32(__m128i __V1, __m128i __V2) {
761 return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
762}
763
764/// Compares the corresponding elements of two 128-bit vectors of
765/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
766/// value of the two.
767///
768/// \headerfile <x86intrin.h>
769///
770/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
771///
772/// \param __V1
773/// A 128-bit vector of [4 x u32].
774/// \param __V2
775/// A 128-bit vector of [4 x u32].
776/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
777static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
778_mm_min_epu32(__m128i __V1, __m128i __V2) {
779 return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
780}
781
782/// Compares the corresponding elements of two 128-bit vectors of
783/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
784/// greater value of the two.
785///
786/// \headerfile <x86intrin.h>
787///
788/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
789///
790/// \param __V1
791/// A 128-bit vector of [4 x u32].
792/// \param __V2
793/// A 128-bit vector of [4 x u32].
794/// \returns A 128-bit vector of [4 x u32] containing the greater values.
795static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
796_mm_max_epu32(__m128i __V1, __m128i __V2) {
797 return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
798}
799
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
/// Builds a 128-bit vector of [4 x float] by starting from \a X, inserting
///    one element chosen from \a Y, and then zeroing selected elements of
///    that result. The source element, the destination element and the
///    zeroed elements are all selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
///
/// \param X
///    A 128-bit vector source operand of [4 x float]. All of its bits are
///    copied to the result, except those replaced by the element taken from
///    \a Y and those zeroed by bits [3:0] of \a N.
/// \param Y
///    A 128-bit vector source operand of [4 x float]. One single-precision
///    floating-point element of this operand, chosen by the immediate, is
///    copied to the result.
/// \param N
///    An immediate that selects the source bits of \a Y, the destination
///    bits of the result, and the result elements to clear: \n
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
///      00: Selects bits [31:0] from operand \a Y. \n
///      01: Selects bits [63:32] from operand \a Y. \n
///      10: Selects bits [95:64] from operand \a Y. \n
///      11: Selects bits [127:96] from operand \a Y. \n
///    Bits [5:4] specify the result bits that receive the selected bits of
///    \a Y: \n
///      00: Result bits [31:0]. \n
///      01: Result bits [63:32]. \n
///      10: Result bits [95:64]. \n
///      11: Result bits [127:96]. \n
///    Bits [3:0]: each bit that is set clears the corresponding result
///    element.
/// \returns A 128-bit vector of [4 x float] containing the copied
///    single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
842
/// Returns the 32 bits of one element of a 128-bit vector of [4 x float],
///    reinterpreted as an integer. The element is selected by the immediate
///    \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determine which bits of \a X are
///    extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) \
  __builtin_bit_cast(int, \
                     __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), \
                                                 (int)(N)))
868
/* Miscellaneous insert and extract macros. */
/* Store into D the single-precision element of X found at index N. The
   do/while (0) wrapper lets the macro be used as a single statement. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  } while (0)
875
/* Build an immediate suitable for _mm_insert_ps: X is placed in bits [7:6],
   Y in bits [5:4], and the zeroing bits Z are OR'ed into the low bits. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
879
/* Return a vector whose first element is the float found at index N of X.
   The insert targets element 0 of an all-zero vector, and the 0x0e zeroing
   bits cover the other three elements. */
#define _MM_PICK_OUT_PS(X, N) \
  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
883
/* Insert int into packed integer array at index. */
/// Returns a copy of a 128-bit vector of [16 x i8] in which one element has
///    been replaced by the lower 8 bits of the integer \a I. The element is
///    selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. It is copied to the result, and
///    one of the sixteen result elements is then replaced by the lower 8 bits
///    of \a I.
/// \param I
///    An integer. Its lower 8 bits are written to the result at the position
///    given by \a N.
/// \param N
///    An immediate value. Bits [3:0] select the result byte that receives the
///    lower 8 bits of \a I: \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi( \
      (__v16qi)(__m128i)(X), (int)(I), (int)(N)))
928
/// Returns a copy of a 128-bit vector of [4 x i32] in which one element has
///    been replaced by the 32-bit integer \a I. The element is selected by
///    the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. It is copied to the result, and
///    one of the four result elements is then replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result at the position given
///    by \a N.
/// \param N
///    An immediate value. Bits [1:0] select the result element that receives
///    \a I: \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si( \
      (__v4si)(__m128i)(X), (int)(I), (int)(N)))
960
#ifdef __x86_64__
/// Returns a copy of a 128-bit vector of [2 x i64] in which one element has
///    been replaced by the 64-bit integer \a I. The element is selected by
///    the immediate \a N. Only defined when __x86_64__ is defined.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. It is copied to the result, and
///    one of the two result elements is then replaced by \a I.
/// \param I
///    A 64-bit integer that is written to the result at the position given
///    by \a N.
/// \param N
///    An immediate value. Bit [0] selects the result element that receives
///    \a I: \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di( \
      (__v2di)(__m128i)(X), (long long)(I), (int)(N)))
#endif /* __x86_64__ */
992
/* Extract int from packed integer array at index. The element is returned
 * zero extended (note the unsigned char cast below), so it is unsigned.
 */
/// Extracts one 8-bit element of a 128-bit integer vector of [16 x i8]. The
///    element is selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] select the 8-bit element of \a X that is
///    extracted and copied to the result: \n
///    0000: Bits [7:0] of parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns An unsigned integer whose lower 8 bits hold the selected element
///    of the 128-bit integer vector parameter; the remaining bits are zero.
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi( \
      (__v16qi)(__m128i)(X), (int)(N)))
1034
/// Extracts one 32-bit element of a 128-bit integer vector of [4 x i32]. The
///    element is selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] select the 32-bit element of \a X that
///    is extracted and copied to the result: \n
///    00: Bits [31:0] of the parameter \a X are extracted. \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns An integer holding the 32 bits selected from the 128-bit integer
///    vector parameter.
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
1059
/// Extracts one 64-bit element of a 128-bit integer vector of [2 x i64]. The
///    element is selected by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
/// in 64-bit mode.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] selects the 64-bit element of \a X that is
///    returned: \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
1082
1083/* SSE4 128-bit Packed Integer Comparisons. */
1084/// Tests whether the specified bits in a 128-bit integer vector are all
1085/// zeros.
1086///
1087/// \headerfile <x86intrin.h>
1088///
1089/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1090///
1091/// \param __M
1092/// A 128-bit integer vector containing the bits to be tested.
1093/// \param __V
1094/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1095/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1096static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
1097 __m128i __V) {
1098 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1099}
1100
1101/// Tests whether the specified bits in a 128-bit integer vector are all
1102/// ones.
1103///
1104/// \headerfile <x86intrin.h>
1105///
1106/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1107///
1108/// \param __M
1109/// A 128-bit integer vector containing the bits to be tested.
1110/// \param __V
1111/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1112/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1113static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
1114 __m128i __V) {
1115 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1116}
1117
1118/// Tests whether the specified bits in a 128-bit integer vector are
1119/// neither all zeros nor all ones.
1120///
1121/// \headerfile <x86intrin.h>
1122///
1123/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1124///
1125/// \param __M
1126/// A 128-bit integer vector containing the bits to be tested.
1127/// \param __V
1128/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1129/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1130/// FALSE otherwise.
1131static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
1132 __m128i __V) {
1133 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1134}
1135
/// Reports whether every bit of a 128-bit integer vector is set to 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_ones(__m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param V
///    A 128-bit integer vector containing the bits to be tested.
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
///    otherwise.
/* Implemented as _mm_testc_si128 with an all-ones selector, so every bit of
   V takes part in the test. */
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
1152
/// Reports whether the selected bits of a 128-bit integer vector are a mix
///    of zeros and ones, that is, neither all zeros nor all ones.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M are tested.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
///    FALSE otherwise.
/* Alias for _mm_testnzc_si128. */
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
1171
/// Reports whether the selected bits of a 128-bit integer vector are all
///    zeros.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M are tested.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
/* Alias for _mm_testz_si128. */
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
1189
1190/* SSE4 64-bit Packed Integer Comparisons. */
1191/// Compares each of the corresponding 64-bit values of the 128-bit
1192/// integer vectors for equality.
1193///
1194/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1195///
1196/// \headerfile <x86intrin.h>
1197///
1198/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1199///
1200/// \param __V1
1201/// A 128-bit integer vector.
1202/// \param __V2
1203/// A 128-bit integer vector.
1204/// \returns A 128-bit integer vector containing the comparison results.
1205static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1206_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) {
1207 return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1208}
1209
1210/* SSE4 Packed Integer Sign-Extension. */
1211/// Sign-extends each of the lower eight 8-bit integer elements of a
1212/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1213/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1214/// are unused.
1215///
1216/// \headerfile <x86intrin.h>
1217///
1218/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1219///
1220/// \param __V
1221/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1222/// sign-extended to 16-bit values.
1223/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1224static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1225_mm_cvtepi8_epi16(__m128i __V) {
1226 /* This function always performs a signed extension, but __v16qi is a char
1227 which may be signed or unsigned, so use __v16qs. */
1228 return (__m128i) __builtin_convertvector(
1229 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
1230 7),
1231 __v8hi);
1232}
1233
1234/// Sign-extends each of the lower four 8-bit integer elements of a
1235/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1236/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1237/// vector are unused.
1238///
1239/// \headerfile <x86intrin.h>
1240///
1241/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1242///
1243/// \param __V
1244/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1245/// sign-extended to 32-bit values.
1246/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1247static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1248_mm_cvtepi8_epi32(__m128i __V) {
1249 /* This function always performs a signed extension, but __v16qi is a char
1250 which may be signed or unsigned, so use __v16qs. */
1251 return (__m128i) __builtin_convertvector(
1252 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1253}
1254
1255/// Sign-extends each of the lower two 8-bit integer elements of a
1256/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1257/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1258/// vector are unused.
1259///
1260/// \headerfile <x86intrin.h>
1261///
1262/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1263///
1264/// \param __V
1265/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1266/// sign-extended to 64-bit values.
1267/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1268static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1269_mm_cvtepi8_epi64(__m128i __V) {
1270 /* This function always performs a signed extension, but __v16qi is a char
1271 which may be signed or unsigned, so use __v16qs. */
1272 return (__m128i) __builtin_convertvector(
1273 __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1274}
1275
1276/// Sign-extends each of the lower four 16-bit integer elements of a
1277/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1278/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1279/// vector are unused.
1280///
1281/// \headerfile <x86intrin.h>
1282///
1283/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1284///
1285/// \param __V
1286/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1287/// sign-extended to 32-bit values.
1288/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1289static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1291 return (__m128i) __builtin_convertvector(
1292 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1293}
1294
1295/// Sign-extends each of the lower two 16-bit integer elements of a
1296/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1297/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1298/// vector are unused.
1299///
1300/// \headerfile <x86intrin.h>
1301///
1302/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1303///
1304/// \param __V
1305/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1306/// sign-extended to 64-bit values.
1307/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1308static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1310 return (__m128i) __builtin_convertvector(
1311 __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1312}
1313
1314/// Sign-extends each of the lower two 32-bit integer elements of a
1315/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1316/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1317/// are unused.
1318///
1319/// \headerfile <x86intrin.h>
1320///
1321/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1322///
1323/// \param __V
1324/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1325/// sign-extended to 64-bit values.
1326/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1327static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1329 return (__m128i) __builtin_convertvector(
1330 __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1331}
1332
1333/* SSE4 Packed Integer Zero-Extension. */
1334/// Zero-extends each of the lower eight 8-bit integer elements of a
1335/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1336/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1337/// are unused.
1338///
1339/// \headerfile <x86intrin.h>
1340///
1341/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1342///
1343/// \param __V
1344/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1345/// zero-extended to 16-bit values.
1346/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1347static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1348_mm_cvtepu8_epi16(__m128i __V) {
1349 return (__m128i) __builtin_convertvector(
1350 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
1351 7),
1352 __v8hi);
1353}
1354
1355/// Zero-extends each of the lower four 8-bit integer elements of a
1356/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1357/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1358/// vector are unused.
1359///
1360/// \headerfile <x86intrin.h>
1361///
1362/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1363///
1364/// \param __V
1365/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1366/// zero-extended to 32-bit values.
1367/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1368static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1369_mm_cvtepu8_epi32(__m128i __V) {
1370 return (__m128i) __builtin_convertvector(
1371 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1372}
1373
1374/// Zero-extends each of the lower two 8-bit integer elements of a
1375/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1376/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1377/// vector are unused.
1378///
1379/// \headerfile <x86intrin.h>
1380///
1381/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1382///
1383/// \param __V
1384/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1385/// zero-extended to 64-bit values.
1386/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1387static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1388_mm_cvtepu8_epi64(__m128i __V) {
1389 return (__m128i) __builtin_convertvector(
1390 __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1391}
1392
1393/// Zero-extends each of the lower four 16-bit integer elements of a
1394/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1395/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1396/// vector are unused.
1397///
1398/// \headerfile <x86intrin.h>
1399///
1400/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1401///
1402/// \param __V
1403/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1404/// zero-extended to 32-bit values.
1405/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1406static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1408 return (__m128i) __builtin_convertvector(
1409 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1410}
1411
1412/// Zero-extends each of the lower two 16-bit integer elements of a
1413/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1414/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1415/// are unused.
1416///
1417/// \headerfile <x86intrin.h>
1418///
1419/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1420///
1421/// \param __V
1422/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1423/// zero-extended to 64-bit values.
1424/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1425static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1427 return (__m128i) __builtin_convertvector(
1428 __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1429}
1430
1431/// Zero-extends each of the lower two 32-bit integer elements of a
1432/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1433/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1434/// are unused.
1435///
1436/// \headerfile <x86intrin.h>
1437///
1438/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1439///
1440/// \param __V
1441/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1442/// zero-extended to 64-bit values.
1443/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1444static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1446 return (__m128i) __builtin_convertvector(
1447 __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1448}
1449
1450/* SSE4 Pack with Unsigned Saturation. */
1451/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
1452/// vector operands into 16-bit unsigned integers, and returns the packed
1453/// result.
1454///
1455/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1456/// 0x0000 are saturated to 0x0000.
1457///
1458/// \headerfile <x86intrin.h>
1459///
1460/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1461///
1462/// \param __V1
1463/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1464/// written to the lower 64 bits of the result.
1465/// \param __V2
1466/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1467/// written to the higher 64 bits of the result.
1468/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1469static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
1470_mm_packus_epi32(__m128i __V1, __m128i __V2) {
1471 return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1472}
1473
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes eight sums of absolute differences between groups of four
///    unsigned 8-bit integers taken from the two operands. The immediate
///    operand selects which groups of bytes are used.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are
///    to be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128( \
      (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (M)))
1513
1514/// Finds the minimum unsigned 16-bit element in the input 128-bit
1515/// vector of [8 x u16] and returns it and along with its index.
1516///
1517/// \headerfile <x86intrin.h>
1518///
1519/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1520/// instruction.
1521///
1522/// \param __V
1523/// A 128-bit vector of [8 x u16].
1524/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1525/// in parameter \a __V, bits [18:16] contain the index of the minimum value
1526/// and the remaining bits are set to 0.
1527static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
 /* The input is reinterpreted as __v8hi for the builtin and the builtin's
    result is cast back to __m128i. */
1528 return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
1529}
1530
1531/* Handle the sse4.2 definitions here. */
1532
1533/* These definitions are normally in nmmintrin.h, but gcc puts them in here
1534 so we'll do the same. */
1535
/* Replace the "sse4.1" default attribute macros with "sse4.2" versions for
   the definitions that follow. Unlike the earlier definition in this header,
   this one does not set __min_vector_width__. */
1536#undef __DEFAULT_FN_ATTRS
1537#undef __DEFAULT_FN_ATTRS_CONSTEXPR
1538#define __DEFAULT_FN_ATTRS \
1539 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
1540
/* The constexpr variant only adds constexpr when compiling as C++11 or
   later. Otherwise it is identical to __DEFAULT_FN_ATTRS. */
1541#if defined(__cplusplus) && (__cplusplus >= 201103L)
1542#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
1543#else
1544#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
1545#endif
1546
1547/* These specify the type of data that we're comparing (bits [1:0] of the immediate operand). */
1548#define _SIDD_UBYTE_OPS 0x00
1549#define _SIDD_UWORD_OPS 0x01
1550#define _SIDD_SBYTE_OPS 0x02
1551#define _SIDD_SWORD_OPS 0x03
1552
1553/* These specify the type of comparison operation (bits [3:2] of the immediate operand). */
1554#define _SIDD_CMP_EQUAL_ANY 0x00
1555#define _SIDD_CMP_RANGES 0x04
1556#define _SIDD_CMP_EQUAL_EACH 0x08
1557#define _SIDD_CMP_EQUAL_ORDERED 0x0c
1558
1559/* These macros specify the polarity of the operation (bits [5:4] of the immediate operand). */
1560#define _SIDD_POSITIVE_POLARITY 0x00
1561#define _SIDD_NEGATIVE_POLARITY 0x10
1562#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
1563#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
1564
1565/* These macros are used in _mm_cmpXstri() to select which index is returned (bit 6). */
1566#define _SIDD_LEAST_SIGNIFICANT 0x00
1567#define _SIDD_MOST_SIGNIFICANT 0x40
1568
1569/* These macros are used in _mm_cmpXstrm() to specify the format of the returned mask (bit 6). */
1570#define _SIDD_BIT_MASK 0x00
1571#define _SIDD_UNIT_MASK 0x40
1572
1573/* SSE4.2 Packed Comparison Intrinsics. */
1574/// Uses the immediate operand \a M to perform a comparison of string
1575/// data with implicitly defined lengths that is contained in source operands
1576/// \a A and \a B. Returns a 128-bit integer vector representing the result
1577/// mask of the comparison.
1578///
1579/// \headerfile <x86intrin.h>
1580///
1581/// \code
1582/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1583/// \endcode
1584///
1585/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
1586/// instruction.
1587///
1588/// \param A
1589/// A 128-bit integer vector containing one of the source operands to be
1590/// compared.
1591/// \param B
1592/// A 128-bit integer vector containing one of the source operands to be
1593/// compared.
1594/// \param M
1595/// An 8-bit immediate operand specifying whether the characters are bytes or
1596/// words, the type of comparison to perform, and the format of the return
1597/// value. \n
1598/// Bits [1:0]: Determine source data format. \n
1599/// 00: 16 unsigned bytes \n
1600/// 01: 8 unsigned words \n
1601/// 10: 16 signed bytes \n
1602/// 11: 8 signed words \n
1603/// Bits [3:2]: Determine comparison type and aggregation method. \n
1604/// 00: Subset: Each character in \a B is compared for equality with all
1605/// the characters in \a A. \n
1606/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1607/// basis is greater than or equal for even-indexed elements in \a A,
1608/// and less than or equal for odd-indexed elements in \a A. \n
1609/// 10: Match: Compare each pair of corresponding characters in \a A and
1610/// \a B for equality. \n
1611/// 11: Substring: Search \a B for substring matches of \a A. \n
1612/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1613/// mask of the comparison results. \n
1614/// 00: No effect. \n
1615/// 01: Negate the bit mask. \n
1616/// 10: No effect. \n
1617/// 11: Negate the bit mask only for bits with an index less than or equal
1618/// to the size of \a A or \a B. \n
1619/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1620/// bytes. \n
1621/// 0: The result is zero-extended to 16 bytes. \n
1622/// 1: The result is expanded to 16 bytes (this expansion is performed by
1623/// repeating each bit 8 or 16 times).
1624/// \returns Returns a 128-bit integer vector representing the result mask of
1625/// the comparison.
/* Implementation note: A and B are converted to __m128i and reinterpreted as
   __v16qi, M is cast to int, and the result of the pcmpistrm128 builtin is
   cast to __m128i. */
1626#define _mm_cmpistrm(A, B, M) \
1627 ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
1628 (__v16qi)(__m128i)(B), (int)(M)))
1629
1630/// Uses the immediate operand \a M to perform a comparison of string
1631/// data with implicitly defined lengths that is contained in source operands
1632/// \a A and \a B. Returns an integer representing the result index of the
1633/// comparison.
1634///
1635/// \headerfile <x86intrin.h>
1636///
1637/// \code
1638/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1639/// \endcode
1640///
1641/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1642/// instruction.
1643///
1644/// \param A
1645/// A 128-bit integer vector containing one of the source operands to be
1646/// compared.
1647/// \param B
1648/// A 128-bit integer vector containing one of the source operands to be
1649/// compared.
1650/// \param M
1651/// An 8-bit immediate operand specifying whether the characters are bytes or
1652/// words, the type of comparison to perform, and the format of the return
1653/// value. \n
1654/// Bits [1:0]: Determine source data format. \n
1655/// 00: 16 unsigned bytes \n
1656/// 01: 8 unsigned words \n
1657/// 10: 16 signed bytes \n
1658/// 11: 8 signed words \n
1659/// Bits [3:2]: Determine comparison type and aggregation method. \n
1660/// 00: Subset: Each character in \a B is compared for equality with all
1661/// the characters in \a A. \n
1662/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1663/// basis is greater than or equal for even-indexed elements in \a A,
1664/// and less than or equal for odd-indexed elements in \a A. \n
1665/// 10: Match: Compare each pair of corresponding characters in \a A and
1666/// \a B for equality. \n
1667/// 11: Substring: Search \a B for substring matches of \a A. \n
1668/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1669/// mask of the comparison results. \n
1670/// 00: No effect. \n
1671/// 01: Negate the bit mask. \n
1672/// 10: No effect. \n
1673/// 11: Negate the bit mask only for bits with an index less than or equal
1674/// to the size of \a A or \a B. \n
1675/// Bit [6]: Determines whether the index of the lowest set bit or the
1676/// highest set bit is returned. \n
1677/// 0: The index of the least significant set bit. \n
1678/// 1: The index of the most significant set bit. \n
1679/// \returns Returns an integer representing the result index of the comparison.
/* Implementation note: A and B are converted to __m128i and reinterpreted as
   __v16qi, M is cast to int, and the result of the pcmpistri128 builtin is
   cast to int. */
1680#define _mm_cmpistri(A, B, M) \
1681 ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
1682 (__v16qi)(__m128i)(B), (int)(M)))
1683
1684/// Uses the immediate operand \a M to perform a comparison of string
1685/// data with explicitly defined lengths that is contained in source operands
1686/// \a A and \a B. Returns a 128-bit integer vector representing the result
1687/// mask of the comparison.
1688///
1689/// \headerfile <x86intrin.h>
1690///
1691/// \code
1692/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1693/// \endcode
1694///
1695/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
1696/// instruction.
1697///
1698/// \param A
1699/// A 128-bit integer vector containing one of the source operands to be
1700/// compared.
1701/// \param LA
1702/// An integer that specifies the length of the string in \a A.
1703/// \param B
1704/// A 128-bit integer vector containing one of the source operands to be
1705/// compared.
1706/// \param LB
1707/// An integer that specifies the length of the string in \a B.
1708/// \param M
1709/// An 8-bit immediate operand specifying whether the characters are bytes or
1710/// words, the type of comparison to perform, and the format of the return
1711/// value. \n
1712/// Bits [1:0]: Determine source data format. \n
1713/// 00: 16 unsigned bytes \n
1714/// 01: 8 unsigned words \n
1715/// 10: 16 signed bytes \n
1716/// 11: 8 signed words \n
1717/// Bits [3:2]: Determine comparison type and aggregation method. \n
1718/// 00: Subset: Each character in \a B is compared for equality with all
1719/// the characters in \a A. \n
1720/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1721/// basis is greater than or equal for even-indexed elements in \a A,
1722/// and less than or equal for odd-indexed elements in \a A. \n
1723/// 10: Match: Compare each pair of corresponding characters in \a A and
1724/// \a B for equality. \n
1725/// 11: Substring: Search \a B for substring matches of \a A. \n
1726/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1727/// mask of the comparison results. \n
1728/// 00: No effect. \n
1729/// 01: Negate the bit mask. \n
1730/// 10: No effect. \n
1731/// 11: Negate the bit mask only for bits with an index less than or equal
1732/// to the size of \a A or \a B. \n
1733/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1734/// bytes. \n
1735/// 0: The result is zero-extended to 16 bytes. \n
1736/// 1: The result is expanded to 16 bytes (this expansion is performed by
1737/// repeating each bit 8 or 16 times). \n
1738/// \returns Returns a 128-bit integer vector representing the result mask of
1739/// the comparison.
/* Implementation note: A and B are converted to __m128i and reinterpreted as
   __v16qi, while LA, LB and M are each cast to int. The result of the
   pcmpestrm128 builtin is cast to __m128i. */
1740#define _mm_cmpestrm(A, LA, B, LB, M) \
1741 ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
1742 (__v16qi)(__m128i)(B), (int)(LB), \
1743 (int)(M)))
1744
1745/// Uses the immediate operand \a M to perform a comparison of string
1746/// data with explicitly defined lengths that is contained in source operands
1747/// \a A and \a B. Returns an integer representing the result index of the
1748/// comparison.
1749///
1750/// \headerfile <x86intrin.h>
1751///
1752/// \code
1753/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1754/// \endcode
1755///
1756/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
1757/// instruction.
1758///
1759/// \param A
1760/// A 128-bit integer vector containing one of the source operands to be
1761/// compared.
1762/// \param LA
1763/// An integer that specifies the length of the string in \a A.
1764/// \param B
1765/// A 128-bit integer vector containing one of the source operands to be
1766/// compared.
1767/// \param LB
1768/// An integer that specifies the length of the string in \a B.
1769/// \param M
1770/// An 8-bit immediate operand specifying whether the characters are bytes or
1771/// words, the type of comparison to perform, and the format of the return
1772/// value. \n
1773/// Bits [1:0]: Determine source data format. \n
1774/// 00: 16 unsigned bytes \n
1775/// 01: 8 unsigned words \n
1776/// 10: 16 signed bytes \n
1777/// 11: 8 signed words \n
1778/// Bits [3:2]: Determine comparison type and aggregation method. \n
1779/// 00: Subset: Each character in \a B is compared for equality with all
1780/// the characters in \a A. \n
1781/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1782/// basis is greater than or equal for even-indexed elements in \a A,
1783/// and less than or equal for odd-indexed elements in \a A. \n
1784/// 10: Match: Compare each pair of corresponding characters in \a A and
1785/// \a B for equality. \n
1786/// 11: Substring: Search \a B for substring matches of \a A. \n
1787/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1788/// mask of the comparison results. \n
1789/// 00: No effect. \n
1790/// 01: Negate the bit mask. \n
1791/// 10: No effect. \n
1792/// 11: Negate the bit mask only for bits with an index less than or equal
1793/// to the size of \a A or \a B. \n
1794/// Bit [6]: Determines whether the index of the lowest set bit or the
1795/// highest set bit is returned. \n
1796/// 0: The index of the least significant set bit. \n
1797/// 1: The index of the most significant set bit. \n
1798/// \returns Returns an integer representing the result index of the comparison.
/* Implementation note: A and B are converted to __m128i and reinterpreted as
   __v16qi, while LA, LB and M are each cast to int. The result of the
   pcmpestri128 builtin is cast to int. */
1799#define _mm_cmpestri(A, LA, B, LB, M) \
1800 ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
1801 (__v16qi)(__m128i)(B), (int)(LB), \
1802 (int)(M)))
1803
1804/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
1805/// Uses the immediate operand \a M to perform a comparison of string
1806/// data with implicitly defined lengths that is contained in source operands
1807/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1808/// string in \a B is the maximum, otherwise, returns 0.
1809///
1810/// \headerfile <x86intrin.h>
1811///
1812/// \code
1813/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1814/// \endcode
1815///
1816/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1817/// instruction.
1818///
1819/// \param A
1820/// A 128-bit integer vector containing one of the source operands to be
1821/// compared.
1822/// \param B
1823/// A 128-bit integer vector containing one of the source operands to be
1824/// compared.
1825/// \param M
1826/// An 8-bit immediate operand specifying whether the characters are bytes or
1827/// words and the type of comparison to perform. \n
1828/// Bits [1:0]: Determine source data format. \n
1829/// 00: 16 unsigned bytes \n
1830/// 01: 8 unsigned words \n
1831/// 10: 16 signed bytes \n
1832/// 11: 8 signed words \n
1833/// Bits [3:2]: Determine comparison type and aggregation method. \n
1834/// 00: Subset: Each character in \a B is compared for equality with all
1835/// the characters in \a A. \n
1836/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1837/// basis is greater than or equal for even-indexed elements in \a A,
1838/// and less than or equal for odd-indexed elements in \a A. \n
1839/// 10: Match: Compare each pair of corresponding characters in \a A and
1840/// \a B for equality. \n
1841/// 11: Substring: Search \a B for substring matches of \a A. \n
1842/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1843/// mask of the comparison results. \n
1844/// 00: No effect. \n
1845/// 01: Negate the bit mask. \n
1846/// 10: No effect. \n
1847/// 11: Negate the bit mask only for bits with an index less than or equal
1848/// to the size of \a A or \a B. \n
1849/// \returns Returns 1 if the bit mask is zero and the length of the string in
1850/// \a B is the maximum; otherwise, returns 0.
/* Implementation note: forwards to the pcmpistria128 builtin (the trailing
   'a' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, M is cast to int, and the result is cast to
   int. */
1851#define _mm_cmpistra(A, B, M) \
1852 ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
1853 (__v16qi)(__m128i)(B), (int)(M)))
1854
1855/// Uses the immediate operand \a M to perform a comparison of string
1856/// data with implicitly defined lengths that is contained in source operands
1857/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
1858/// 0.
1859///
1860/// \headerfile <x86intrin.h>
1861///
1862/// \code
1863/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1864/// \endcode
1865///
1866/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1867/// instruction.
1868///
1869/// \param A
1870/// A 128-bit integer vector containing one of the source operands to be
1871/// compared.
1872/// \param B
1873/// A 128-bit integer vector containing one of the source operands to be
1874/// compared.
1875/// \param M
1876/// An 8-bit immediate operand specifying whether the characters are bytes or
1877/// words and the type of comparison to perform. \n
1878/// Bits [1:0]: Determine source data format. \n
1879/// 00: 16 unsigned bytes \n
1880/// 01: 8 unsigned words \n
1881/// 10: 16 signed bytes \n
1882/// 11: 8 signed words \n
1883/// Bits [3:2]: Determine comparison type and aggregation method. \n
1884/// 00: Subset: Each character in \a B is compared for equality with all
1885/// the characters in \a A. \n
1886/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1887/// basis is greater than or equal for even-indexed elements in \a A,
1888/// and less than or equal for odd-indexed elements in \a A. \n
1889/// 10: Match: Compare each pair of corresponding characters in \a A and
1890/// \a B for equality. \n
1891/// 11: Substring: Search \a B for substring matches of \a A. \n
1892/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1893/// mask of the comparison results. \n
1894/// 00: No effect. \n
1895/// 01: Negate the bit mask. \n
1896/// 10: No effect. \n
1897/// 11: Negate the bit mask only for bits with an index less than or equal
1898/// to the size of \a A or \a B.
1899/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
/* Implementation note: forwards to the pcmpistric128 builtin (the trailing
   'c' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, M is cast to int, and the result is cast to
   int. */
1900#define _mm_cmpistrc(A, B, M) \
1901 ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
1902 (__v16qi)(__m128i)(B), (int)(M)))
1903
1904/// Uses the immediate operand \a M to perform a comparison of string
1905/// data with implicitly defined lengths that is contained in source operands
1906/// \a A and \a B. Returns bit 0 of the resulting bit mask.
1907///
1908/// \headerfile <x86intrin.h>
1909///
1910/// \code
1911/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
1912/// \endcode
1913///
1914/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1915/// instruction.
1916///
1917/// \param A
1918/// A 128-bit integer vector containing one of the source operands to be
1919/// compared.
1920/// \param B
1921/// A 128-bit integer vector containing one of the source operands to be
1922/// compared.
1923/// \param M
1924/// An 8-bit immediate operand specifying whether the characters are bytes or
1925/// words and the type of comparison to perform. \n
1926/// Bits [1:0]: Determine source data format. \n
1927/// 00: 16 unsigned bytes \n
1928/// 01: 8 unsigned words \n
1929/// 10: 16 signed bytes \n
1930/// 11: 8 signed words \n
1931/// Bits [3:2]: Determine comparison type and aggregation method. \n
1932/// 00: Subset: Each character in \a B is compared for equality with all
1933/// the characters in \a A. \n
1934/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1935/// basis is greater than or equal for even-indexed elements in \a A,
1936/// and less than or equal for odd-indexed elements in \a A. \n
1937/// 10: Match: Compare each pair of corresponding characters in \a A and
1938/// \a B for equality. \n
1939/// 11: Substring: Search \a B for substring matches of \a A. \n
1940/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1941/// mask of the comparison results. \n
1942/// 00: No effect. \n
1943/// 01: Negate the bit mask. \n
1944/// 10: No effect. \n
1945/// 11: Negate the bit mask only for bits with an index less than or equal
1946/// to the size of \a A or \a B. \n
1947/// \returns Returns bit 0 of the resulting bit mask.
/* Implementation note: forwards to the pcmpistrio128 builtin (the trailing
   'o' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, M is cast to int, and the result is cast to
   int. */
1948#define _mm_cmpistro(A, B, M) \
1949 ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
1950 (__v16qi)(__m128i)(B), (int)(M)))
1951
1952/// Uses the immediate operand \a M to perform a comparison of string
1953/// data with implicitly defined lengths that is contained in source operands
1954/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
1955/// the maximum, otherwise, returns 0.
1956///
1957/// \headerfile <x86intrin.h>
1958///
1959/// \code
1960/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
1961/// \endcode
1962///
1963/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1964/// instruction.
1965///
1966/// \param A
1967/// A 128-bit integer vector containing one of the source operands to be
1968/// compared.
1969/// \param B
1970/// A 128-bit integer vector containing one of the source operands to be
1971/// compared.
1972/// \param M
1973/// An 8-bit immediate operand specifying whether the characters are bytes or
1974/// words and the type of comparison to perform. \n
1975/// Bits [1:0]: Determine source data format. \n
1976/// 00: 16 unsigned bytes \n
1977/// 01: 8 unsigned words \n
1978/// 10: 16 signed bytes \n
1979/// 11: 8 signed words \n
1980/// Bits [3:2]: Determine comparison type and aggregation method. \n
1981/// 00: Subset: Each character in \a B is compared for equality with all
1982/// the characters in \a A. \n
1983/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1984/// basis is greater than or equal for even-indexed elements in \a A,
1985/// and less than or equal for odd-indexed elements in \a A. \n
1986/// 10: Match: Compare each pair of corresponding characters in \a A and
1987/// \a B for equality. \n
1988/// 11: Substring: Search \a B for substring matches of \a A. \n
1989/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1990/// mask of the comparison results. \n
1991/// 00: No effect. \n
1992/// 01: Negate the bit mask. \n
1993/// 10: No effect. \n
1994/// 11: Negate the bit mask only for bits with an index less than or equal
1995/// to the size of \a A or \a B. \n
1996/// \returns Returns 1 if the length of the string in \a A is less than the
1997/// maximum, otherwise, returns 0.
/* Implementation note: forwards to the pcmpistris128 builtin (the trailing
   's' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, M is cast to int, and the result is cast to
   int. */
1998#define _mm_cmpistrs(A, B, M) \
1999 ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
2000 (__v16qi)(__m128i)(B), (int)(M)))
2001
2002/// Uses the immediate operand \a M to perform a comparison of string
2003/// data with implicitly defined lengths that is contained in source operands
2004/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2005/// the maximum, otherwise, returns 0.
2006///
2007/// \headerfile <x86intrin.h>
2008///
2009/// \code
2010/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
2011/// \endcode
2012///
2013/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
2014/// instruction.
2015///
2016/// \param A
2017/// A 128-bit integer vector containing one of the source operands to be
2018/// compared.
2019/// \param B
2020/// A 128-bit integer vector containing one of the source operands to be
2021/// compared.
2022/// \param M
2023/// An 8-bit immediate operand specifying whether the characters are bytes or
2024/// words and the type of comparison to perform. \n
2025/// Bits [1:0]: Determine source data format. \n
2026/// 00: 16 unsigned bytes \n
2027/// 01: 8 unsigned words \n
2028/// 10: 16 signed bytes \n
2029/// 11: 8 signed words \n
2030/// Bits [3:2]: Determine comparison type and aggregation method. \n
2031/// 00: Subset: Each character in \a B is compared for equality with all
2032/// the characters in \a A. \n
2033/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2034/// basis is greater than or equal for even-indexed elements in \a A,
2035/// and less than or equal for odd-indexed elements in \a A. \n
2036/// 10: Match: Compare each pair of corresponding characters in \a A and
2037/// \a B for equality. \n
2038/// 11: Substring: Search \a B for substring matches of \a A. \n
2039/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2040/// mask of the comparison results. \n
2041/// 00: No effect. \n
2042/// 01: Negate the bit mask. \n
2043/// 10: No effect. \n
2044/// 11: Negate the bit mask only for bits with an index less than or equal
2045/// to the size of \a A or \a B.
2046/// \returns Returns 1 if the length of the string in \a B is less than the
2047/// maximum, otherwise, returns 0.
/* Implementation note: forwards to the pcmpistriz128 builtin (the trailing
   'z' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, M is cast to int, and the result is cast to
   int. */
2048#define _mm_cmpistrz(A, B, M) \
2049 ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
2050 (__v16qi)(__m128i)(B), (int)(M)))
2051
2052/// Uses the immediate operand \a M to perform a comparison of string
2053/// data with explicitly defined lengths that is contained in source operands
2054/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2055/// string in \a B is the maximum, otherwise, returns 0.
2056///
2057/// \headerfile <x86intrin.h>
2058///
2059/// \code
2060/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2061/// \endcode
2062///
2063/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2064/// instruction.
2065///
2066/// \param A
2067/// A 128-bit integer vector containing one of the source operands to be
2068/// compared.
2069/// \param LA
2070/// An integer that specifies the length of the string in \a A.
2071/// \param B
2072/// A 128-bit integer vector containing one of the source operands to be
2073/// compared.
2074/// \param LB
2075/// An integer that specifies the length of the string in \a B.
2076/// \param M
2077/// An 8-bit immediate operand specifying whether the characters are bytes or
2078/// words and the type of comparison to perform. \n
2079/// Bits [1:0]: Determine source data format. \n
2080/// 00: 16 unsigned bytes \n
2081/// 01: 8 unsigned words \n
2082/// 10: 16 signed bytes \n
2083/// 11: 8 signed words \n
2084/// Bits [3:2]: Determine comparison type and aggregation method. \n
2085/// 00: Subset: Each character in \a B is compared for equality with all
2086/// the characters in \a A. \n
2087/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2088/// basis is greater than or equal for even-indexed elements in \a A,
2089/// and less than or equal for odd-indexed elements in \a A. \n
2090/// 10: Match: Compare each pair of corresponding characters in \a A and
2091/// \a B for equality. \n
2092/// 11: Substring: Search \a B for substring matches of \a A. \n
2093/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2094/// mask of the comparison results. \n
2095/// 00: No effect. \n
2096/// 01: Negate the bit mask. \n
2097/// 10: No effect. \n
2098/// 11: Negate the bit mask only for bits with an index less than or equal
2099/// to the size of \a A or \a B.
2100/// \returns Returns 1 if the bit mask is zero and the length of the string in
2101/// \a B is the maximum, otherwise, returns 0.
/* Implementation note: forwards to the pcmpestria128 builtin (the trailing
   'a' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, LA, LB and M are each cast to int, and the
   result is cast to int. */
2102#define _mm_cmpestra(A, LA, B, LB, M) \
2103 ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
2104 (__v16qi)(__m128i)(B), (int)(LB), \
2105 (int)(M)))
2106
2107/// Uses the immediate operand \a M to perform a comparison of string
2108/// data with explicitly defined lengths that is contained in source operands
2109/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
2110/// returns 0.
2111///
2112/// \headerfile <x86intrin.h>
2113///
2114/// \code
2115/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2116/// \endcode
2117///
2118/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2119/// instruction.
2120///
2121/// \param A
2122/// A 128-bit integer vector containing one of the source operands to be
2123/// compared.
2124/// \param LA
2125/// An integer that specifies the length of the string in \a A.
2126/// \param B
2127/// A 128-bit integer vector containing one of the source operands to be
2128/// compared.
2129/// \param LB
2130/// An integer that specifies the length of the string in \a B.
2131/// \param M
2132/// An 8-bit immediate operand specifying whether the characters are bytes or
2133/// words and the type of comparison to perform. \n
2134/// Bits [1:0]: Determine source data format. \n
2135/// 00: 16 unsigned bytes \n
2136/// 01: 8 unsigned words \n
2137/// 10: 16 signed bytes \n
2138/// 11: 8 signed words \n
2139/// Bits [3:2]: Determine comparison type and aggregation method. \n
2140/// 00: Subset: Each character in \a B is compared for equality with all
2141/// the characters in \a A. \n
2142/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2143/// basis is greater than or equal for even-indexed elements in \a A,
2144/// and less than or equal for odd-indexed elements in \a A. \n
2145/// 10: Match: Compare each pair of corresponding characters in \a A and
2146/// \a B for equality. \n
2147/// 11: Substring: Search \a B for substring matches of \a A. \n
2148/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2149/// mask of the comparison results. \n
2150/// 00: No effect. \n
2151/// 01: Negate the bit mask. \n
2152/// 10: No effect. \n
2153/// 11: Negate the bit mask only for bits with an index less than or equal
2154/// to the size of \a A or \a B. \n
2155/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
/* Implementation note: forwards to the pcmpestric128 builtin (the trailing
   'c' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, LA, LB and M are each cast to int, and the
   result is cast to int. */
2156#define _mm_cmpestrc(A, LA, B, LB, M) \
2157 ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
2158 (__v16qi)(__m128i)(B), (int)(LB), \
2159 (int)(M)))
2160
2161/// Uses the immediate operand \a M to perform a comparison of string
2162/// data with explicitly defined lengths that is contained in source operands
2163/// \a A and \a B. Returns bit 0 of the resulting bit mask.
2164///
2165/// \headerfile <x86intrin.h>
2166///
2167/// \code
2168/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
2169/// \endcode
2170///
2171/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2172/// instruction.
2173///
2174/// \param A
2175/// A 128-bit integer vector containing one of the source operands to be
2176/// compared.
2177/// \param LA
2178/// An integer that specifies the length of the string in \a A.
2179/// \param B
2180/// A 128-bit integer vector containing one of the source operands to be
2181/// compared.
2182/// \param LB
2183/// An integer that specifies the length of the string in \a B.
2184/// \param M
2185/// An 8-bit immediate operand specifying whether the characters are bytes or
2186/// words and the type of comparison to perform. \n
2187/// Bits [1:0]: Determine source data format. \n
2188/// 00: 16 unsigned bytes \n
2189/// 01: 8 unsigned words \n
2190/// 10: 16 signed bytes \n
2191/// 11: 8 signed words \n
2192/// Bits [3:2]: Determine comparison type and aggregation method. \n
2193/// 00: Subset: Each character in \a B is compared for equality with all
2194/// the characters in \a A. \n
2195/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2196/// basis is greater than or equal for even-indexed elements in \a A,
2197/// and less than or equal for odd-indexed elements in \a A. \n
2198/// 10: Match: Compare each pair of corresponding characters in \a A and
2199/// \a B for equality. \n
2200/// 11: Substring: Search \a B for substring matches of \a A. \n
2201/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2202/// mask of the comparison results. \n
2203/// 00: No effect. \n
2204/// 01: Negate the bit mask. \n
2205/// 10: No effect. \n
2206/// 11: Negate the bit mask only for bits with an index less than or equal
2207/// to the size of \a A or \a B.
2208/// \returns Returns bit 0 of the resulting bit mask.
/* Implementation note: forwards to the pcmpestrio128 builtin (the trailing
   'o' matches this macro's name). A and B are converted to __m128i and
   reinterpreted as __v16qi, LA, LB and M are each cast to int, and the
   result is cast to int. */
2209#define _mm_cmpestro(A, LA, B, LB, M) \
2210 ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
2211 (__v16qi)(__m128i)(B), (int)(LB), \
2212 (int)(M)))
2213
2214/// Uses the immediate operand \a M to perform a comparison of string
2215/// data with explicitly defined lengths that is contained in source operands
2216/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
2217/// the maximum, otherwise, returns 0.
2218///
2219/// \headerfile <x86intrin.h>
2220///
2221/// \code
2222/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
2223/// \endcode
2224///
2225/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2226/// instruction.
2227///
2228/// \param A
2229/// A 128-bit integer vector containing one of the source operands to be
2230/// compared.
2231/// \param LA
2232/// An integer that specifies the length of the string in \a A.
2233/// \param B
2234/// A 128-bit integer vector containing one of the source operands to be
2235/// compared.
2236/// \param LB
2237/// An integer that specifies the length of the string in \a B.
2238/// \param M
2239/// An 8-bit immediate operand specifying whether the characters are bytes or
2240/// words and the type of comparison to perform. \n
2241/// Bits [1:0]: Determine source data format. \n
2242/// 00: 16 unsigned bytes \n
2243/// 01: 8 unsigned words \n
2244/// 10: 16 signed bytes \n
2245/// 11: 8 signed words \n
2246/// Bits [3:2]: Determine comparison type and aggregation method. \n
2247/// 00: Subset: Each character in \a B is compared for equality with all
2248/// the characters in \a A. \n
2249/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2250/// basis is greater than or equal for even-indexed elements in \a A,
2251/// and less than or equal for odd-indexed elements in \a A. \n
2252/// 10: Match: Compare each pair of corresponding characters in \a A and
2253/// \a B for equality. \n
2254/// 11: Substring: Search \a B for substring matches of \a A. \n
2255/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2256/// mask of the comparison results. \n
2257/// 00: No effect. \n
2258/// 01: Negate the bit mask. \n
2259/// 10: No effect. \n
2260/// 11: Negate the bit mask only for bits with an index less than or equal
2261/// to the size of \a A or \a B.
2262/// \returns Returns 1 if the length of the string in \a A is less than the
2263/// maximum, otherwise, returns 0.
2264#define _mm_cmpestrs(A, LA, B, LB, M) \
2265 ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
2266 (__v16qi)(__m128i)(B), (int)(LB), \
2267 (int)(M)))
2268
2269/// Uses the immediate operand \a M to perform a comparison of string
2270/// data with explicitly defined lengths that is contained in source operands
2271/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2272/// the maximum, otherwise, returns 0.
2273///
2274/// \headerfile <x86intrin.h>
2275///
2276/// \code
2277/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
2278/// \endcode
2279///
2280/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
2281///
2282/// \param A
2283/// A 128-bit integer vector containing one of the source operands to be
2284/// compared.
2285/// \param LA
2286/// An integer that specifies the length of the string in \a A.
2287/// \param B
2288/// A 128-bit integer vector containing one of the source operands to be
2289/// compared.
2290/// \param LB
2291/// An integer that specifies the length of the string in \a B.
2292/// \param M
2293/// An 8-bit immediate operand specifying whether the characters are bytes or
2294/// words and the type of comparison to perform. \n
2295/// Bits [1:0]: Determine source data format. \n
2296/// 00: 16 unsigned bytes \n
2297/// 01: 8 unsigned words \n
2298/// 10: 16 signed bytes \n
2299/// 11: 8 signed words \n
2300/// Bits [3:2]: Determine comparison type and aggregation method. \n
2301/// 00: Subset: Each character in \a B is compared for equality with all
2302/// the characters in \a A. \n
2303/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2304/// basis is greater than or equal for even-indexed elements in \a A,
2305/// and less than or equal for odd-indexed elements in \a A. \n
2306/// 10: Match: Compare each pair of corresponding characters in \a A and
2307/// \a B for equality. \n
2308/// 11: Substring: Search \a B for substring matches of \a A. \n
2309/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2310/// mask of the comparison results. \n
2311/// 00: No effect. \n
2312/// 01: Negate the bit mask. \n
2313/// 10: No effect. \n
2314/// 11: Negate the bit mask only for bits with an index less than or equal
2315/// to the size of \a A or \a B.
2316/// \returns Returns 1 if the length of the string in \a B is less than the
2317/// maximum, otherwise, returns 0.
2318#define _mm_cmpestrz(A, LA, B, LB, M) \
2319 ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
2320 (__v16qi)(__m128i)(B), (int)(LB), \
2321 (int)(M)))
2322
2323/* SSE4.2 Compare Packed Data -- Greater Than. */
2324/// Compares each of the corresponding 64-bit values of the 128-bit
2325/// integer vectors to determine if the values in the first operand are
2326/// greater than those in the second operand.
2327///
2328/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
2329///
2330/// \headerfile <x86intrin.h>
2331///
2332/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2333///
2334/// \param __V1
2335/// A 128-bit integer vector, compared as \c __v2di; supplies the left-hand operand of each greater-than comparison.
2336/// \param __V2
2337/// A 128-bit integer vector, compared as \c __v2di; supplies the right-hand operand of each greater-than comparison.
2338/// \returns A 128-bit integer vector containing the comparison results.
2339static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2340_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) {
2341 return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2342}
2343
2344#undef __DEFAULT_FN_ATTRS
2345#undef __DEFAULT_FN_ATTRS_CONSTEXPR
2346
2347#include <popcntintrin.h>
2348
2349#include <crc32intrin.h>
2350
2351#endif /* __SMMINTRIN_H */
#define __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding even-indexed elements of two 128-bit vectors of [4 x i32] and returns a 128-...
Definition smmintrin.h:562
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition smmintrin.h:706
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_stream_load_si128(const void *__V)
Loads integer values from a 128-bit aligned memory location to a 128-bit integer vector.
Definition smmintrin.h:651
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition smmintrin.h:760
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors to determine if the v...
Definition smmintrin.h:2340
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
Compares each of the corresponding 64-bit values of the 128-bit integer vectors for equality.
Definition smmintrin.h:1206
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition smmintrin.h:688
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_blendv_epi8(__m128i __V1, __m128i __V2, __m128i __M)
Returns a 128-bit vector of [16 x i8] where the values are selected from either of the first or secon...
Definition smmintrin.h:495
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M)
Returns a 128-bit vector of [2 x double] where the values are selected from either the first or secon...
Definition smmintrin.h:443
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M)
Returns a 128-bit vector of [4 x float] where the values are selected from either the first or second...
Definition smmintrin.h:469
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi16(__m128i __V)
Zero-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition smmintrin.h:1348
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu16_epi64(__m128i __V)
Zero-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition smmintrin.h:1426
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition smmintrin.h:796
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu32_epi64(__m128i __V)
Zero-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition smmintrin.h:1445
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_packus_epi32(__m128i __V1, __m128i __V2)
Converts, with saturation, 32-bit signed integers from both 128-bit integer vector operands into 16-b...
Definition smmintrin.h:1470
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi32(__m128i __V)
Sign-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition smmintrin.h:1248
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi16_epi64(__m128i __V)
Sign-extends each of the lower two 16-bit integer elements of a 128-bit integer vector of [8 x i16] t...
Definition smmintrin.h:1309
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi32(__m128i __V)
Zero-extends each of the lower four 8-bit integer elements of a 128-bit vector of [16 x i8] to 32-bit...
Definition smmintrin.h:1369
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi64(__m128i __V)
Sign-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition smmintrin.h:1269
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are neither all zeros nor all ones.
Definition smmintrin.h:1131
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epi32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x i32] and returns a 128-bit vector ...
Definition smmintrin.h:742
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all ones.
Definition smmintrin.h:1113
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mullo_epi32(__m128i __V1, __m128i __V2)
Multiplies corresponding elements of two 128-bit vectors of [4 x i32] and returns the lower 32 bits of...
Definition smmintrin.h:543
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi16_epi32(__m128i __V)
Sign-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition smmintrin.h:1290
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epu32(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [4 x u32] and returns a 128-bit vector ...
Definition smmintrin.h:778
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi32_epi64(__m128i __V)
Sign-extends each of the lower two 32-bit integer elements of a 128-bit integer vector of [4 x i32] t...
Definition smmintrin.h:1328
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu16_epi32(__m128i __V)
Zero-extends each of the lower four 16-bit integer elements of a 128-bit integer vector of [8 x i16] ...
Definition smmintrin.h:1407
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_max_epu16(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [8 x u16] and returns a 128-bit vector ...
Definition smmintrin.h:724
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepi8_epi16(__m128i __V)
Sign-extends each of the lower eight 8-bit integer elements of a 128-bit vector of [16 x i8] to 16-bi...
Definition smmintrin.h:1225
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtepu8_epi64(__m128i __V)
Zero-extends each of the lower two 8-bit integer elements of a 128-bit integer vector of [16 x i8] to...
Definition smmintrin.h:1388
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V)
Finds the minimum unsigned 16-bit element in the input 128-bit vector of [8 x u16] and returns it and...
Definition smmintrin.h:1527
static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, __m128i __V)
Tests whether the specified bits in a 128-bit integer vector are all zeros.
Definition smmintrin.h:1096
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_min_epi8(__m128i __V1, __m128i __V2)
Compares the corresponding elements of two 128-bit vectors of [16 x i8] and returns a 128-bit vector ...
Definition smmintrin.h:670